Skip to content

Instantly share code, notes, and snippets.

@supercaracal
Last active June 19, 2023 01:03
Show Gist options
  • Save supercaracal/6541ec281d1d59d5ebfb80d939e51219 to your computer and use it in GitHub Desktop.
Save supercaracal/6541ec281d1d59d5ebfb80d939e51219 to your computer and use it in GitHub Desktop.
An index of Linux Device Driver 3rd edition for my poor memory

Getting started

$ sudo apt install linux-source
$ cp /usr/src/linux-source-5.15.0/linux-source-5.15.0.tar.bz2 /path/to/where
$ tar jxf linux-source-5.15.0.tar.bz2
  ERROR: Kernel configuration is invalid.
         include/generated/autoconf.h or include/config/auto.conf are missing.
         Run 'make oldconfig && make prepare' on kernel src to fix it.
$ sudo apt install libelf-dev
$ make olddefconfig
$ make prepare
$ make modules_prepare

https://stackoverflow.com/a/18479126

Chapter 1

  • mechanism and policy
    • what capabilities are to be provided ... the mechanism
      • the X server
    • how those capabilities can be used ... the policy
      • the window and session managers
    • policy free
  • kernel
    • process management
    • memory management
    • filesystems
    • device control
    • networking
  • devices and modules
    • character devices
    • block devices
    • network devices
  • version numbering
    • even-numbered kernel versions are stable

Chapter 2

#include <linux/init.h>
#include <linux/module.h>
MODULE_LICENSE("GPL"); // avoid tainted module layout for kernel

/*
 * Module load hook (registered with module_init below): prints a greeting
 * at KERN_ALERT level and always returns 0.
 */
static int hello_init(void)
{
  printk(KERN_ALERT "Hello, world\n");
  return 0;
}

/*
 * Module unload hook (registered with module_exit below): prints a farewell
 * message at KERN_ALERT level.
 */
static void hello_exit(void)
{
  printk(KERN_ALERT "Goodbye, cruel world\n");
}

module_init(hello_init);
module_exit(hello_exit);
root# tail -f /var/log/messages
root# insmod ./hello.ko
root# rmmod hello
  • event-driven
  • no libc included
  • not sequential, concurrent, asynchronous
  • current process:
#include <linux/sched.h>

printk(KERN_INFO "The process is \"%s\"(pid %d)\n", current->comm, current->pid);
  • the kernel has a small stack (it can be as small as a single 4096-byte page), so large variables should be allocated dynamically instead of being declared on the stack
  • __ prefixed functions are low-level component, we should use them carefully
  • cannot do floating point arithmetic
  • kbuild is the make-based build system for the kernel
# KERNELRELEASE is set when this file is read by the kernel build system;
# in that case only the module object needs to be declared.
ifneq ($(KERNELRELEASE),)
  obj-m := hello.o
else
# Otherwise make was run directly in the module directory: hand control to
# the kernel tree, telling it (M=) where the module source lives.
  KERNELDIR ?= /lib/modules/$(shell uname -r)/build
  PWD := $(shell pwd)

# NOTE: the recipe line below must start with a TAB character.
default:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules
endif
  • insmod calls sys_init_module function within kernel/module.c
  • system calls are prefixed with sys_
  • modprobe loads other modules that the module needs
  • lsmod lists the modules currently loaded
    • /proc/modules
    • /sys/
  • link the module against vermagic.o
  • linux/version.h
    • UTS_RELEASE ... string
    • LINUX_VERSION_CODE ... binary
    • KERNEL_VERSION(major,minor,release)
    • optimization each processor family
  • module stack
  • avoid symbol pollution
EXPORT_SYMBOL(name);
EXPORT_SYMBOL_GPL(name);
  • MODULE_*
    • MODULE_LICENSE
    • MODULE_AUTHOR
    • MODULE_DESCRIPTION
    • MODULE_VERSION
    • MODULE_ALIAS
    • MODULE_DEVICE_TABLE
  • grep EXPORT_SYMBOL or register_ for finding entrypoint of other device drivers
  • special modifiers
    • static int __init initialization_function(void)
    • static void __exit cleanup_function(void)
  • error handling
    • use the goto statement for error recovery; it is the best tool for the job and keeps CPU time low
    • <linux/errno.h>
    • negative numbers
  • module-loading races
  • module parameters
    • /etc/modprobe.conf
    • moduleparam.h
    • bool, invbool, charp, int, long, short, uint, ulong, ushort
      • charp is a char pointer
    • $ insmod my_module param_one=foo param_two=7 param_three=a,b,c,d,e
static char *param_one = "baz";
static int param_two = 1;
static char **param_three = ["a", "b", "c"];

// the 3rd arg is a permission for sysfs, see <linux/stat.h>
// if the permission makes writable, the module should be able to detect it by self
module_param(param_one, charp, S_IRUGO);
module_param(param_two, int, S_IRUGO);
module_param_array(param_three, charp, 12, S_IRUGO);
  • user-space device driver
    • we can use the libc
    • as a server process
    • overhead of the context switch
    • require some privileges
    • e.g.
      • gadgetfs
      • the X server
      • the following user-space drivers rely on the SCSI generic kernel-space driver
        • SCSI scanner drivers (in SANE pkg)
        • CD writers (in cdrecord pkg)
  • See /sys/module, /proc/modules

Chapter 3

  • character devices are identified by a "c" in the first column of the ls -l output, like this:
$ ls -l /dev/ | head
total 0
crw-r--r-- 1 root root     10, 235 Feb 22 16:17 autofs
drwxr-xr-x 2 root root         580 Feb 22 16:17 block
drwxr-xr-x 2 root root         100 Feb 22 16:17 bsg
crw-rw---- 1 root disk     10, 234 Feb 22 16:17 btrfs-control
drwxr-xr-x 3 root root          60 Feb 22 16:17 bus
drwxr-xr-x 2 root root        2720 Feb 22 16:17 char
crw--w---- 1 root tty       5,   1 Feb 22 16:18 console
lrwxrwxrwx 1 root root          11 Feb 22 16:17 core -> /proc/kcore
crw------- 1 root root     10, 125 Feb 22 16:17 cpu_dma_latency
  • the comma-separated numbers are the major and minor device numbers
    • major numbers ... 1,4,7, and 10
    • minor numbers ... 1,3,5,64,65 and 129
  • one-major-one-driver principle
#include <linux/types.h>

MAJOR(dev_t dev);
MINOR(dev_t dev);
MKDEV(int major, int minor);
#include <linux/fs.h>

// returns zero on success or a negative error code
int register_chrdev_region(dev_t first, unsigned int count, char *name);

// dynamic allocation of the device numbers
int alloc_chrdev_region(dev_t *dev, unsigned int firstminor, unsigned int count, char *name);

void unregister_chrdev_region(dev_t first, unsigned int count);
/* File operations table wiring the scull handlers into the char device. */
struct file_operations scull_fops = {
  .owner   = THIS_MODULE,
  /* lifecycle */
  .open    = scull_open,
  .release = scull_release,
  /* data transfer and positioning */
  .read    = scull_read,
  .write   = scull_write,
  .llseek  = scull_llseek,
  /* device control */
  .ioctl   = scull_ioctl,
};
void cdev_init(struct cdev *, struct file_operations *);

// num is the first device number, count is the number of device numbers associated with the device
// cdev_add should be called only when everything is ready
int cdev_add(struct cdev *, dev_t num, unsigned int count);

void cdev_del(struct cdev *);
  • registration, open, release, write, read
    • pread, pwrite ... they don't change the file position
    • readv, writev ... vector version
#include <linux/slab.h>

void *kmalloc(size_t size, int flags);
void kfree(void *ptr);
  • DO NOT DEREFERENCE DIRECTLY user space buffer
// signed size type
ssize_t read(struct file *filp, char __user *buff, size_t count, loff_t *offp);
ssize_t write(struct file *filp, const char __user *buff, size_t count, loff_t *offp);

// page cache consideration, swappable
// user-space pointer validation
#include <linux/uaccess.h>
unsigned long copy_to_user(void __user *to, const void *from, unsigned long count);
unsigned long copy_from_user(void *to, const void __user *from, unsigned long count);

Chapter 4

$ cat /proc/sys/kernel/printk
4       4       1       7

## current, default, minimum, boot-time default
int printk_ratelimit(void);

if (printk_ratelimit())
  printk(KERN_NOTICE "The printer is still on fire\n");
$ cat /proc/sys/kernel/printk_ratelimit
5 # the number of seconds to wait before re-enabling messages

$ cat /proc/sys/kernel/printk_ratelimit_burst
10 # the number of messages accepted before rate-limiting
  • printing device numbers
int print_dev_t(char *buffer, dev_t dev);
char *format_dev_t(char *buffer, dev_t dev);
int (*read_proc)(char *page, char **start, off_t offset, int count, int *eof, void *data);
struct proc_dir_entry *create_proc_read_entry(const char *name, mode_t mode, struct proc_dir_entry *base, read_proc_t *read_proc, void *data);
remove_proc_entry(const char *name, struct proc_dir_entry *base);
arg desc
page the buffer where you'll write your data
start is used by the function to say where the interesting data has been written in page, is useful if the buffer is greater than one page, may be NULL
offset same as the read function
count same as the read function
eof must be set by the driver to signal that it has no more data to return
data you can use for internal bookkeeping
arg desc
name the name of the file in /proc
mode the protection mask, zero is system-wide default
base the parent dir, NULL is /proc root
read_proc the function
data the client data, ignored by kernel
void *start(struct seq_file *sfile, loff_t *pos);
void *next(struct seq_file *sfile, void *v, loff_t *pos);
void stop(struct seq_file *sfile, void *v);
int show(struct seq_file *sfile, void *v);
// you should not use printk in show function, instead:

int seq_printf(struct seq_file *sfile, const char *fmt, ...);
int seq_putc(struct seq_file *sfile, char c);
int seq_puts(struct seq_file *sfile, const char *s);
int seq_escape(struct seq_file *sfile, const char *s, const char *esc);
int seq_path(struct seq_file *sfile, struct vfsmount *m, struct dentry *dentry, char *esc);
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent);

entry->proc_fops = &your_fops;
  • use strace command
    • with useful options, such as -t, -T, -e, -o, ...
  • oops messages
    • https://github.com/torvalds/linux/blob/master/arch/um/kernel/trap.c
    • EIP ... the instruction pointer
    • on the x86 architecture, by default, the user-space stack starts just below 0xc0000000
      • the kernel-space stack starts at 0xc0000000
      • slab poisoning: if the offending address is 0xa5a5a5a5, you probably forgot to initialize dynamically allocated memory
  • system hangs
  • $ gdb /usr/src/linux/vmlinux /proc/kcore
    • uncompressed ELF kernel executable, not the zImage or bzImage or else
    • .text, .bss, .data
    • /sys/module/*/sections
    • (gdb) print *(address)
  • kdb, kgdb
  • user-mode linux (UML)
    • has no access to the host's hardwares
  • linux trace toolkit (LTT)
  • dynamic probes (DProbes)
    • IBM

Chapter 5

  • SMP, symmetric multiprocessing
  • workqueues, tasklets, timers
  • against critical sections, use semaphore
    • mutex, mutual exclusion
  • holding, taken out, acquired a lock
#include <asm/semaphore.h>

void sema_init(struct semaphore *sem, int val);

DECLARE_MUTEX(name); // 1
DECLARE_MUTEX_LOCKED(name); // 0

// at runtime
void init_MUTEX(struct semaphore *sem);
void init_MUTEX_LOCKED(struct semaphore *sem);

void down(struct semaphore *sem);
int down_interruptible(struct semaphore *sem);
int down_trylock(struct semaphore *sem);

void up(struct semaphore *sem);
  • You should undo any user-visible changes if you return -ERESTARTSYS
    • if you cannot undo things, you should return -EINTR instead
  • rwsem (reader/writer semaphore)
#include <linux/rwsem.h>

void init_rwsem(struct rw_semaphore *sem);

void down_read(struct rw_semaphore *sem);
int down_read_trylock(struct rw_semaphore *sem);
void up_read(struct rw_semaphore *sem);

void down_write(struct rw_semaphore *sem);
int down_write_trylock(struct rw_semaphore *sem);
void up_write(struct rw_semaphore *sem);
void downgrade_write(struct rw_semaphore *sem);
  • completion interface
#include <linux/completion.h>

DECLARE_COMPLETION(struct completion c);

init_completion(struct completion *c);

void wait_for_completion(struct completion *c);

void complete(struct completion *c);
void complete_all(struct completion *c);

INIT_COMPLETION(struct completion c); // reuse

void complete_and_exit(struct completion *c, long retval);
  • spinlocks
    • don't let a process into sleep
    • tight loop
    • avoid deadlocks in hyperthreaded processors
    • spinning forever in nonpreemptive uniprocessor
    • be careful when writing code that executes under a spinlock: it must never sleep
    • hold the lock for the minimum time possible
    • irq ... interrupt request
#include <linux/spinlock.h>

spinlock_t my_lock = SPIN_LOCK_UNLOCKED; // at compile time
void spin_lock_init(spinlock_t *lock); // at runtime

void spin_lock(spinlock_t *lock);
void spin_unlock(spinlock_t *lock);

void spin_lock(spinlock_t *lock);
void spin_lock_irqsave(spinlock_t *lock, unsigned long flags); // disables interrupts on the local processor only
void spin_lock_irq(spinlock_t *lock);
void spin_lock_bh(spinlock_t *lock); // disables software interrupts, but leaves hardware interrupts

void spin_unlock(spinlock_t *lock);
void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags);
void spin_unlock_irq(spinlock_t *lock);
void spin_unlock_bh(spinlock_t *lock);

int spin_trylock(spinlock_t *lock);
int spin_trylock_bh(spinlock_t *lock);

rwlock_t my_rwlock = RW_LOCK_UNLOCKED; // static
void rwlock_init(rwlock_t *); // dynamic
...

Chapter 6

  • ioctl
// ioctl(2) in user-space
int ioctl(int fd, unsigned long cmd, ...);

// in kernel-space
int (*ioctl)(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
int access_ok(int type, const void *addr, unsigned long size);

// e.g.
if (!access_ok(VERIFY_WRITE, addr, sizeof(int))) {
  return -EFAULT;
}

// faster than copy_{from|to}_user functions
// but, if you receive a message like "conversion to non-scalar type requested" from the compiler,
//   you should use copy_{from|to}_user functions instead
put_user(datum, ptr); // calling access_ok internally
__put_user(datum, ptr);
get_user(local, ptr);
__get_user(local, ptr);
#include <linux/sched.h>
int capable(int capability);

// e.g.
if (!capable(CAP_SYS_ADMIN)) {
  return -EPERM;
}
  • safe manners
    • you never sleep when you are running in an atomic context
    • you never know how long your process may have been out of the CPU or what may have changed in the mean time
      • you must check to ensure that the condition you were waiting for is, indeed, true
    • use wait queue
#include <linux/wait.h>

DECLARE_WAIT_QUEUE_HEAD(name); // static
init_waitqueue_head(wait_queue_head_t *); // dynamic
/*
 * Per-device state of the scull pipe: a buffer with separate read and write
 * positions, the wait queues readers and writers sleep on, open counters,
 * and the semaphore that serializes access to the structure.
 */
struct scull_pipe {
  wait_queue_head_t inq, outq;       /* read and write queues */
  char *buffer, *end;                /* begin of buf, end of buf */
  int buffersize;                    /* used in pointer arithmetic */
  char *rp, *wp;                     /* where to read, where to write */
  int nreaders, nwriters;            /* number of openings for r/w */
  struct fasync_struct *async_queue; /* asynchronous readers */
  struct semaphore sem;              /* mutual exclusion semaphore */
  struct cdev cdev;                  /* Char device structure */
};

static ssize_t scull_p_read (struct file *filp, char _ _user *buf, size_t count, loff_t *f_pos) {
  struct scull_pipe *dev = filp->private_data;

  if (down_interruptible(&dev->sem)) {
    return -ERESTARTSYS;
  }

  while (dev->rp == dev->wp) { /* nothing to read */
    up(&dev->sem); /* release the lock */
    if (filp->f_flags & O_NONBLOCK) {
      return -EAGAIN;
    }

    PDEBUG("\"%s\" reading: going to sleep\n", current->comm);
    if (wait_event_interruptible(dev->inq, (dev->rp != dev->wp))) {
      return -ERESTARTSYS; /* signal: tell the fs layer to handle it */
    }

    /* otherwise loop, but first reacquire the lock */
    if (down_interruptible(&dev->sem)) {
      return -ERESTARTSYS;
    }
  }

  /* ok, data is there, return something */
  if (dev->wp > dev->rp) {
    count = min(count, (size_t)(dev->wp - dev->rp));
  } else /* the write pointer has wrapped, return data up to dev->end */ {
    count = min(count, (size_t)(dev->end - dev->rp));
  }

  if (copy_to_user(buf, dev->rp, count)) {
    up (&dev->sem);
    return -EFAULT;
  }

  dev->rp += count;
  if (dev->rp == dev->end) {
    dev->rp = dev->buffer; /* wrapped */
  }

  up(&dev->sem);

  /* finally, awake any writers and return */
  wake_up_interruptible(&dev->outq);
  PDEBUG("\"%s\" did read %li bytes\n",current->comm, (long)count);
  return count;
}
void set_current_state(int new_state);
current->state = TASK_INTERRUPTIBLE; // discouraged manner

if (!condition) {
  schedule();
}

DEFINE_WAIT(my_wait);
init_wait(wait_queue_t *wait);
void prepare_to_wait(wait_queue_head_t *queue, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *queue, wait_queue_t *wait);
/* Wait for space for writing; caller must hold device semaphore.  On
 * error the semaphore will be released before returning.
 *
 * Returns 0 once spacefree(dev) is non-zero (semaphore held again),
 * -EAGAIN if the buffer is full and the file is O_NONBLOCK, or
 * -ERESTARTSYS if a signal is pending or the semaphore could not be
 * reacquired. */
static int scull_getwritespace(struct scull_pipe *dev, struct file *filp) {
  while (spacefree(dev) == 0) { /* full */
    DEFINE_WAIT(wait);

    /* drop the semaphore before possibly sleeping */
    up(&dev->sem);
    if (filp->f_flags & O_NONBLOCK) {
      return -EAGAIN;
    }

    PDEBUG("\"%s\" writing: going to sleep\n", current->comm);
    /* queue ourselves first, then re-check the condition and only
     * give up the processor if the buffer is still full */
    prepare_to_wait(&dev->outq, &wait, TASK_INTERRUPTIBLE);
    if (spacefree(dev) == 0) {
      schedule();
    }

    finish_wait(&dev->outq, &wait);
    if (signal_pending(current)) {
      return -ERESTARTSYS; /* signal: tell the fs layer to handle it */
    }
    /* retake the semaphore before testing spacefree() again in the loop */
    if (down_interruptible(&dev->sem)) {
      return -ERESTARTSYS;
    }
  }

  return 0;
}
void prepare_to_wait_exclusive(wait_queue_head_t *queue, wait_queue_t *wait, int state);
unsigned int (*poll)(struct file *filep, poll_table *wait);
void poll_wait(struct file *, wait_queue_head_t *, poll_table *);
/*
 * poll method of the scull pipe: registers both wait queues with the poll
 * table and returns the mask of operations that are currently possible.
 */
static unsigned int scull_p_poll(struct file *filp, poll_table *wait) {
  struct scull_pipe *pipe = filp->private_data;
  unsigned int ready = 0;

  down(&pipe->sem);

  poll_wait(filp, &pipe->inq,  wait);
  poll_wait(filp, &pipe->outq, wait);

  /*
   * Circular buffer: rp == wp means empty, while "wp" sitting right
   * behind "rp" means full.
   */
  if (pipe->rp != pipe->wp) {
    ready |= POLLIN | POLLRDNORM;    /* data can be read */
  }
  if (spacefree(pipe)) {
    ready |= POLLOUT | POLLWRNORM;   /* room to write */
  }

  up(&pipe->sem);

  return ready;
}
  • flushing
// datasync is used to distinguish between fsync and fdatasync system calls, for filesystem code
int (*fsync)(struct file *file, struct dentry *dentry, int datasync);
signal(SIGIO, &input_handler); /* dummy sample; sigaction() is better */
fcntl(STDIN_FILENO, F_SETOWN, getpid());
oflags = fcntl(STDIN_FILENO, F_GETFL);
fcntl(STDIN_FILENO, F_SETFL, oflags | FASYNC);
#include <linux/fs.h>

int fasync_helper(int fd, struct file *filp, int mode, struct fasync_struct **fa);
void kill_fasync(struct fasync_struct **fa, int sig, int band);
/*
 * fasync method of the scull pipe: forwards the request to fasync_helper()
 * together with the device's list of asynchronous readers.
 */
static int scull_p_fasync(int fd, struct file *filp, int mode) {
  struct scull_pipe *pipe = filp->private_data;
  struct fasync_struct **readers = &pipe->async_queue;

  return fasync_helper(fd, filp, mode, readers);
}
/* send SIGIO to the asynchronous readers, if any are registered */
if (dev->async_queue) {
  kill_fasync(&dev->async_queue, SIGIO, POLL_IN);
}
/* remove this filp from the asynchronously notified filp's */
scull_p_fasync(-1, filp, 0);
  • seeking
/*
 * llseek method: computes the new file position from `whence` and `off`,
 * stores it in filp->f_pos and returns it.  Returns -EINVAL for an unknown
 * `whence` or when the resulting position would be negative.
 */
loff_t scull_llseek(struct file *filp, loff_t off, int whence) {
  struct scull_dev *dev = filp->private_data;
  loff_t target;

  if (whence == 0) {          /* SEEK_SET: absolute offset */
    target = off;
  } else if (whence == 1) {   /* SEEK_CUR: relative to current position */
    target = filp->f_pos + off;
  } else if (whence == 2) {   /* SEEK_END: relative to device size */
    target = dev->size + off;
  } else {                    /* can't happen */
    return -EINVAL;
  }

  if (target < 0) {
    return -EINVAL;
  }

  filp->f_pos = target;
  return target;
}
// inform the kernel that your device does not support llseek
int nonseekable_open(struct inode *inode, struct file *filp);

// Also, you should point the no_llseek func to your file_operations structures
  • access control
    • current->uid, current->euid, capable(CAP_DAC_OVERRIDE)
    • linux tape driver helps your understanding of access control
      • also, /dev/tty is too
      • software devices, virtual devices

Chapter 7

#include <linux/jiffies.h>

unsigned long j, stamp_1, stamp_half, stamp_n;

j = jiffies; /* read the current value */
stamp_1 = j + HZ; /* 1 second in the future */
stamp_half = j + HZ / 2; /* half a second */
stamp_n = j + n * HZ / 1000; /* n milliseconds */

int time_after(unsigned long a, unsigned long b); /* true when a is after (later than) b */
int time_before(unsigned long a, unsigned long b);
int time_after_eq(unsigned long a, unsigned long b); /* after or equal */
int time_before_eq(unsigned long a, unsigned long b); /* before or equal */
#include <linux/time.h>

// newer
unsigned long timespec_to_jiffies(struct timespec *value);
void jiffies_to_timespec(unsigned long jiffies, struct timespec *value);

// older and popular
unsigned long timeval_to_jiffies(struct timeval *value);
void jiffies_to_timeval(unsigned long jiffies, struct timeval *value);
unsigned long mktime(unsigned int year, unsigned int mon, unsigned int day, unsigned int hour, unsigned int min, unsigned int sec);
void do_gettimeofday(struct timeval *tv);
struct timespec current_kernel_time(void);
#include <linux/sched.h>

set_current_state(TASK_INTERRUPTIBLE);
signed long schedule_timeout(signed long timeout);
#include <linux/delay.h>
void ndelay(unsigned long nsecs);
void udelay(unsigned long usecs);
void mdelay(unsigned long msecs);
void msleep(unsigned int millisecs);
unsigned long msleep_interruptible(unsigned int millisecs);
void ssleep(unsigned int seconds);
#include <linux/timer.h>

/* Abridged listing: only some members of the timer structure are shown. */
struct timer_list {
  /* ... */
  unsigned long expires;           /* presumably the jiffies value at which the timer fires -- confirm */
  void (*function)(unsigned long); /* callback taking a single unsigned long argument */
  unsigned long data;              /* presumably the argument handed to function -- confirm */
};

void init_timer(struct timer_list *timer); // dynamic
struct timer_list TIMER_INITIALIZER(_function, _expires, _data); // static

void add_timer(struct timer_list * timer);
int del_timer(struct timer_list * timer);
  • tasklets
    • .
#include <linux/interrupt.h>

/* Abridged listing: only some members of the tasklet structure are shown. */
struct tasklet_struct {
  /* ... */
  void (*func)(unsigned long); /* callback taking a single unsigned long argument */
  unsigned long data;          /* presumably the argument handed to func -- confirm */
};

void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data);
DECLARE_TASKLET(name, func, data);
DECLARE_TASKLET_DISABLED(name, func, data);
struct workqueue_struct *create_workqueue(const char *name);
struct workqueue_struct *create_singlethread_workqueue(const char *name);

// static
DECLARE_WORK(name, void (*function)(void *), void *data);

// dynamic
INIT_WORK(struct work_struct *work, void (*function)(void *), void *data);
PREPARE_WORK(struct work_struct *work, void (*function)(void *), void *data);

int queue_work(struct workqueue_struct *queue, struct work_struct *work);
int queue_delayed_work(struct workqueue_struct *queue, struct work_struct *work, unsigned long delay);

int cancel_delayed_work(struct work_struct *work);
void flush_workqueue(struct workqueue_struct *queue);
void destroy_workqueue(struct workqueue_struct *queue);
int schedule_work(struct work_struct *work);
int schedule_delayed_work(struct work_struct *work, unsigned long delay);
int cancel_delayed_work(struct work_struct *work);
void flush_scheduled_work(void);
static struct work_struct my_work;

INIT_WORK(&my_work, my_func, &my_data);
prepare_to_wait(&my_wait, &wait, TASK_INTERRUPTIBLE);
schedule_work(&my_work);
schedule();
finish_wait(&my_wait, &wait);

Chapter 8

#include <linux/slab.h>

void *kmalloc(size_t size, int flags);
kmem_cache_t *kmem_cache_create(
  const char *name,
  size_t size,
  size_t offset,
  unsigned long flags,
  void (*constructor)(void *, kmem_cache_t *, unsigned long flags),
  void (*destructor)(void *, kmem_cache_t *, unsigned long flags)
);

void *kmem_cache_alloc(kmem_cache_t *cache, int flags);
void kmem_cache_free(kmem_cache_t *cache, const void *obj);
int kmem_cache_destroy(kmem_cache_t *cache);
mempool_t *mempool_create(int min_nr,  mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data);

typedef void *(mempool_alloc_t)(int gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);

cache = kmem_cache_create(. . .);
pool = mempool_create(MY_POOL_MINIMUM, mempool_alloc_slab, mempool_free_slab, cache);

void *mempool_alloc(mempool_t *pool, int gfp_mask);
void mempool_free(void *element, mempool_t *pool);
int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
void mempool_destroy(mempool_t *pool);
  • to allocate big chunks of memory
get_zeroed_page(unsigned int flags);
__get_free_page(unsigned int flags);
__get_free_pages(unsigned int flags, unsigned int order); // log2N, 1 page is 0, 8 pages are 3

void free_page(unsigned long addr);
void free_pages(unsigned long addr, unsigned long order);
#include <linux/vmalloc.h>

void *vmalloc(unsigned long size);
void vfree(void * addr);
void *ioremap(unsigned long offset, unsigned long size);
void iounmap(void * addr);
$ less /proc/kallsyms
// at a compile time
DEFINE_PER_CPU(int[3], my_percpu_array);
DEFINE_PER_CPU(type, name);

// at a runtime
void *alloc_percpu(type);
void *__alloc_percpu(size_t size, size_t align);

get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use);
per_cpu(variable, int cpu_id);
per_cpu_ptr(void *per_cpu_var, int cpu_id);

EXPORT_PER_CPU_SYMBOL(per_cpu_var);
EXPORT_PER_CPU_SYMBOL_GPL(per_cpu_var);
DECLARE_PER_CPU(type, name);
#include <linux/bootmem.h>
void *alloc_bootmem(unsigned long size);
void *alloc_bootmem_low(unsigned long size);
void *alloc_bootmem_pages(unsigned long size);
void *alloc_bootmem_low_pages(unsigned long size);
void free_bootmem(unsigned long addr, unsigned long size);

Chapter 9

  • hardware registers and memory (RAM)
    • most PCI devices map registers into a memory address region
  • reordering of CPU instructions by compiler optimization
    • prevent to reorder and cache
    • a memory barrier
    • on x86, writes outside the processor are not reordered, but reads are
      • wmb() does nothing
      • mb() is slower than wmb()
#include <linux/kernel.h>
void barrier(void);
#include <asm/system.h>
void rmb(void); 
void read_barrier_depends(void); 
void wmb(void); 
void mb(void);
void smp_rmb(void); 
void smp_read_barrier_depends(void); 
void smp_wmb(void); 
void smp_mb(void);
writel(dev->registers.addr, io_destination_address);
writel(dev->registers.size, io_size);
writel(dev->registers.operation, DEV_READ);
wmb();
writel(dev->registers.control, DEV_GO);
/*
 * Assign value to var, then issue the matching memory barrier (mb, wmb or
 * rmb).  Wrapped in do { ... } while (0) so each macro expands to a single
 * statement.
 */
#define set_mb(var, value)  do { (var) = (value); mb();  } while (0)
#define set_wmb(var, value) do { (var) = (value); wmb(); } while (0)
#define set_rmb(var, value) do { (var) = (value); rmb(); } while (0)
#include <linux/ioport.h>
struct resource *request_region(unsigned long first, unsigned long n, const char *name);
void release_region(unsigned long start, unsigned long n);

// deprecated, not in atomic manner
int check_region(unsigned long first, unsigned long n);
$ cat /proc/ioports
0000-0000 : dma1
0000-0000 : pic1
0000-0000 : timer0
0000-0000 : timer1
0000-0000 : keyboard
0000-0000 : keyboard
0000-0000 : rtc0
0000-0000 : dma page reg
0000-0000 : pic2
0000-0000 : dma2
0000-0000 : fpu
0000-0000 : ACPI PM1a_EVT_BLK
0000-0000 : ACPI PM1a_CNT_BLK
0000-0000 : ACPI PM_TMR
0000-0000 : ACPI GPE0_BLK
// for 8-bit
unsigned inb(unsigned port);
void outb(unsigned char byte, unsigned port);

// for 16-bit
unsigned inw(unsigned port);
void outw(unsigned short word, unsigned port);

// for 32-bit
unsigned inl(unsigned port);
void outl(unsigned longword, unsigned port);

// there is no 64-bit functions
  • string operations
    • keep in mind that byte-ordering
    • also, needs pausing I/O with inb_p, outb_p and so on, prevent to overclock processors
// 8-bit
void insb(unsigned port, void *addr, unsigned long count);
void outsb(unsigned port, void *addr, unsigned long count);

// 16-bit
void insw(unsigned port, void *addr, unsigned long count);
void outsw(unsigned port, void *addr, unsigned long count);

// 32-bit
void insl(unsigned port, void *addr, unsigned long count);
void outsl(unsigned port, void *addr, unsigned long count);
  • the parallel port
    • ECP, EPP modes
    • 8-bit
    • 0x378, 0x278
    • first port
      • bidirectional data register
      • it connects directly to pins 2-9 on the physical connector
    • second port
      • read-only status register
      • e.g. printer, online, out of paper, busy
    • third port
      • output-only control register
      • interrupts
    • TTL, transistor-transistor logic
while (count--) {
    outb(*(ptr++), port);
    wmb();
}
  • I/O memory
#include <linux/ioport.h>

struct resource *request_mem_region(unsigned long start, unsigned long len, char *name);
void release_mem_region(unsigned long start, unsigned long len);

int check_mem_region(unsigned long start, unsigned long len); // deprecated, unsafe, old
#include <asm/io.h>
void *ioremap(unsigned long phys_addr, unsigned long size);
void *ioremap_nocache(unsigned long phys_addr, unsigned long size);
void iounmap(void *addr);

unsigned int ioread8(void *addr);
unsigned int ioread16(void *addr);
unsigned int ioread32(void *addr);

void iowrite8(u8 value, void *addr);
void iowrite16(u16 value, void *addr);
void iowrite32(u32 value, void *addr);

void ioread8_rep(void *addr, void *buf, unsigned long count);
void ioread16_rep(void *addr, void *buf, unsigned long count);
void ioread32_rep(void *addr, void *buf, unsigned long count);
void iowrite8_rep(void *addr, const void *buf, unsigned long count);
void iowrite16_rep(void *addr, const void *buf, unsigned long count);
void iowrite32_rep(void *addr, const void *buf, unsigned long count);

void memset_io(void *addr, u8 value, unsigned int count);
void memcpy_fromio(void *dest, void *source, unsigned int count);
void memcpy_toio(void *dest, void *source, unsigned int count);
$ cat /proc/iomem | head -n 2
00000000-00000000 : Reserved
00000000-00000000 : System RAM
  • ports as I/O memory
// request region in advance
void *ioport_map(unsigned long port, unsigned int count);
void ioport_unmap(void *addr);
while (count--) {
    iowrite8(*ptr++, address);
    wmb();
}
#define ISA_BASE    0xA0000
#define ISA_MAX     0x100000  /* for general memory access */

/* this line appears in silly_init */
io_base = ioremap(ISA_BASE, ISA_MAX - ISA_BASE);

Chapter 10

int request_irq(unsigned int irq,
                irqreturn_t (*handler)(int, void *, struct pt_regs *),
                unsigned long flags, 
                const char *dev_name,
                void *dev_id);

void free_irq(unsigned int irq, void *dev_id);

int can_request_irq(unsigned int irq, unsigned long flags);
$ cat /proc/interrupts
           CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7
  8:          0          0          0          0          0          0          0          0   IO-APIC   8-edge      rtc0
  9:          6          0          0          0          0          0          0          0   IO-APIC   9-fasteoi   acpi
 24:          0          1          0          0          0          0          0          0  Hyper-V PCIe MSI 2818572288-edge      virtio0-config
 25:          0          0       1327          0          0          0          0          0  Hyper-V PCIe MSI 2818572289-edge      virtio0-virtqueues
 26:          0          0          0          0          0          0          0          0  Hyper-V PCIe MSI 1879048192-edge      virtio1-config
 27:          0          0          0          0          0        181          0          0  Hyper-V PCIe MSI 1879048193-edge      virtio1-requests
 28:          0          0          0          0          0          0          1          0  Hyper-V PCIe MSI 3221225472-edge      virtio2-config
 29:          0          0          0          0          0          0          0          1  Hyper-V PCIe MSI 3221225473-edge      virtio2-hiprio
 30:         10          0          0          0          0          0          0          0  Hyper-V PCIe MSI 3221225474-edge      virtio2-requests.0
 31:          0          0          0          0          0          0          0          0  Hyper-V PCIe MSI 2684354560-edge      virtio3-config
 32:          0          0          3          0          0          0          0          0  Hyper-V PCIe MSI 2684354561-edge      virtio3-requests
 33:          0          0          0          0          0          0          0          0  Hyper-V PCIe MSI 2818572288-edge      virtio4-config
 34:          0          0          0          0          0          0          0          0  Hyper-V PCIe MSI 2818572289-edge      virtio4-requests
NMI:          0          0          0          0          0          0          0          0   Non-maskable interrupts
LOC:          0          0          0          0          0          0          0          0   Local timer interrupts
SPU:          0          0          0          0          0          0          0          0   Spurious interrupts
PMI:          0          0          0          0          0          0          0          0   Performance monitoring interrupts
IWI:          1          0          0          0          0          0          0          0   IRQ work interrupts
RTR:          0          0          0          0          0          0          0          0   APIC ICR read retries
RES:       3730       2322       4591       2898       4379       2173       4061       2478   Rescheduling interrupts
CAL:     134428     144302     142620     104106     137352      93561     135487     132745   Function call interrupts
TLB:          0          0          0          0          0          0          0          0   TLB shootdowns
TRM:          0          0          0          0          0          0          0          0   Thermal event interrupts
HYP:     154677      77066       1558       5878       3027        663      11278        759   Hypervisor callback interrupts
HRE:          0          0          0          0          0          0          0          0   Hyper-V reenlightenment interrupts
HVS:     512616     231656     458997     262581     432344     199343     486263     194674   Hyper-V stimer0 interrupts
ERR:          0
MIS:          0
PIN:          0          0          0          0          0          0          0          0   Posted-interrupt notification event
NPI:          0          0          0          0          0          0          0          0   Nested posted-interrupt event
PIW:          0          0          0          0          0          0          0          0   Posted-interrupt wakeup event
$ cat /proc/stat
cpu  248635 645 41321 68260050 7161 0 4129 0 0 0
cpu0 31514 84 6070 8529837 756 0 2522 0 0 0
cpu1 38679 64 4976 8524547 2117 0 1095 0 0 0
cpu2 30271 35 6199 8531722 904 0 236 0 0 0
cpu3 29410 100 4639 8535816 404 0 127 0 0 0
cpu4 33538 106 6229 8528750 584 0 43 0 0 0
cpu5 23946 170 3183 8542885 445 0 51 0 0 0
cpu6 32950 75 6566 8528753 808 0 43 0 0 0
cpu7 28327 11 3459 8537737 1140 0 12 0 0 0
intr 1088268 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1327 0 181 1 1 10 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ctxt 5650638
btime 1681698117
processes 20053
procs_running 1
procs_blocked 0
softirq 5528223 0 1069762 5 38472 0 0 232517 1591526 103 2595838
#include <linux/interrupt.h>
unsigned long probe_irq_on(void);
int probe_irq_off(unsigned long);
/*
 * Advance the circular-buffer position *index by delta bytes.
 * When the new position reaches short_buffer + PAGE_SIZE it wraps back
 * to short_buffer.
 *
 * Defined ahead of short_interrupt() so that the handler sees this
 * static inline helper declared before it calls it.
 */
static inline void short_incr_bp(volatile unsigned long *index, int delta) {
    unsigned long new = *index + delta;
    barrier();  /* Don't optimize these two together */
    *index = (new >= (short_buffer + PAGE_SIZE)) ? short_buffer : new;
}

/*
 * Interrupt handler: formats the current time as a fixed-size text record
 * at short_head, advances short_head, and wakes up sleepers on short_queue.
 *
 * irq, dev_id and regs are not used by the body.
 * Always returns IRQ_HANDLED.
 */
irqreturn_t short_interrupt(int irq, void *dev_id, struct pt_regs *regs) {
    struct timeval tv;
    int written;

    do_gettimeofday(&tv);

    /*
     * Write a 16 byte record. Assume PAGE_SIZE is a multiple of 16.
     * 8 digits of seconds + '.' + 6 digits of microseconds + '\n' = 16;
     * the modulo keeps the seconds field at 8 digits. The arguments are
     * cast to unsigned int to match the %u conversions.
     */
    written = sprintf(
        (char *)short_head,"%08u.%06u\n",
        (unsigned int)(tv.tv_sec % 100000000),
        (unsigned int)(tv.tv_usec)
    );
    BUG_ON(written != 16); /* a different length would break the fixed record size */
    short_incr_bp(&short_head, written);
    wake_up_interruptible(&short_queue); /* awake any reading process */
    return IRQ_HANDLED;
}
  • PIC ... programmable interrupt controller
  • splitting the interrupt handler into two halves
    • top-half handler
      • responds to the interrupt
    • bottom-half handler
      • is scheduled by top-half handler and is executed later at a safer time
      • tasklets, workqueues
  • shared interrupts
    • a shared handler must be able to recognize its own interrupts and should quickly exit by returning IRQ_NONE when its own device has not interrupted

Chapter 11

#include <asm/page.h>   /* get_order() */
#include <linux/gfp.h>  /* __get_free_pages(), GFP_KERNEL */
/* get_order() converts a byte count into the page order the allocator expects */
int order = get_order(16*1024);
/* the page allocator is __get_free_pages() (there is no get_free_pages()) */
buf = __get_free_pages(GFP_KERNEL, order);
u32 cpu_to_le32 (u32);
u32 le32_to_cpu (u32);
#include <asm/unaligned.h>
get_unaligned(ptr);
put_unaligned(val, ptr);
void *ERR_PTR(long error);
long IS_ERR(const void *ptr);
long PTR_ERR(const void *ptr);
struct list_head {
    struct list_head *next, *prev;
};

// embed the above like this:
struct todo_struct {
    struct list_head list;
    int priority; /* driver specific */
    /* ... add other driver-specific fields */
};

struct list_head todo_list;
INIT_LIST_HEAD(&todo_list);  // runtime
LIST_HEAD(todo_list);  // compile time

Chapter 12

  • PCI ... Peripheral Component Interconnect
    • better performance, higher clock rate than ISA
    • jumperless, autodetected at boot time
    • identified by a bus number, a device number and a function number
$ lspci
18b5:00:00.0 System peripheral: Red Hat, Inc. Virtio file system (rev 01)
1fe5:00:00.0 3D controller: Microsoft Corporation Device 008e
538d:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio filesystem (rev 01)
aec5:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio filesystem (rev 01)
c2a1:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio console (rev 01)
c546:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio filesystem (rev 01)
$ ll /proc/bus/pci/
total 0
dr-xr-xr-x 9 root root 0 Apr 27 12:44 ./
dr-xr-xr-x 4 root root 0 Apr 27 12:44 ../
dr-xr-xr-x 3 root root 0 Apr 27 12:44 18b5:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 1fe5:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 538d:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 aec5:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 c2a1:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 c546:00/
-r--r--r-- 1 root root 0 Apr 27 12:44 devices
$ cat /proc/bus/pci/devices | cut -f1
0000
0000
0000
0000
0000
0000
$ tree /sys/bus/pci/devices/
/sys/bus/pci/devices/
├── 18b5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/46d9ad56-18b5-4d77-b62f-0b5891160bb1/pci18b5:00/18b5:00:00.0
├── 1fe5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/502c91c3-1fe5-4692-9534-b0c68e0b8170/pci1fe5:00/1fe5:00:00.0
├── 538d:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/400930b1-538d-4d70-9538-26c4b4e104a7/pci538d:00/538d:00:00.0
├── aec5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/0399541e-aec5-4829-b1f1-6154168c0f99/pciaec5:00/aec5:00:00.0
├── c2a1:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476/pcic2a1:00/c2a1:00:00.0
└── c546:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/9fb76897-c546-420b-9873-997bc89c414d/pcic546:00/c546:00:00.0

6 directories, 0 files
struct pci_device_id foo;
// use PCI_ANY_ID for a field (e.g. vendor or device) when the driver can handle any value of it

PCI_DEVICE(vendor, device);
PCI_DEVICE_CLASS(device_class, device_class_mask);
  • PCI hotplug
  • /lib/modules/KERNEL_VERSION/modules.pcimap
    • MODULE_DEVICE_TABLE(pci, i810_ids)
  • registering a PCI driver
    • name
    • id_table
    • probe()
    • remove()
    • suspend() ... optional
    • resume() ... optional
static struct pci_driver pci_driver = {
    .name = "pci_skel",
    .id_table = ids,
    .probe = probe,
    .remove = remove,
};
/*
 * Init routine (marked __init): registers pci_driver through
 * pci_register_driver() and returns that call's result unchanged.
 */
static int __init pci_skel_init(void) {
    return pci_register_driver(&pci_driver);
}
/*
 * Exit routine (marked __exit): unregisters pci_driver through
 * pci_unregister_driver().
 */
static void __exit pci_skel_exit(void) {
    pci_unregister_driver(&pci_driver);
}
  • enabling the PCI device
int pci_enable_device(struct pci_dev *dev);
int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val);
int pci_read_config_word(struct pci_dev *dev, int where, u16 *val); // byte-order
int pci_read_config_dword(struct pci_dev *dev, int where, u32 *val); // byte-order

int pci_write_config_byte(struct pci_dev *dev, int where, u8 val);
int pci_write_config_word(struct pci_dev *dev, int where, u16 val); // byte-order
int pci_write_config_dword(struct pci_dev *dev, int where, u32 val); // byte-order
/*
 * Read the byte at PCI_REVISION_ID from the configuration space of dev
 * and return it. The status returned by pci_read_config_byte() is not
 * checked.
 */
static unsigned char skel_get_revision(struct pci_dev *dev)
{
    u8 rev;

    pci_read_config_byte(dev, PCI_REVISION_ID, &rev);

    return rev;
}
unsigned long pci_resource_start(struct pci_dev *dev, int bar);
unsigned long pci_resource_end(struct pci_dev *dev, int bar);
unsigned long pci_resource_flags(struct pci_dev *dev, int bar);
  • PCI interrupts
result = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &myirq);
if (result) {
    /* deal with error */
}
  • hardware abstraction
    • object oriented layout
    • like the file operations
    • the usual structure containing methods
    • it adds just the minimal overhead of dereferencing a pointer to the normal overhead of a function call
struct pci_ops {
    int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
    int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
};
/* size is the access width in bytes, so a one-byte configuration read passes 1 */
dev->bus->ops->read(bus, devfn, where, 1, val);

Chapter 13

  • USB ... universal serial bus
    • User
    • Kernel
      • VFS layers, block layers, net layers, char layers, TTY layers, ...
      • USB device drivers
      • USB core
      • USB host controllers
    • Hardware
  • https://www.usb.org/
  • Device
    • Config
      • Interface <- USB driver
        • Endpoint
        • Endpoint
        • Endpoint
      • Interface <- USB driver
        • Endpoint
        • Endpoint
        • Endpoint
  • Endpoint
    • OUT endpoint
      • host -> device
    • IN endpoint
      • device -> host
    • unidirectional
    • four types
      • CONTROL
        • endpoint 0
      • INTERRUPT
        • keyboards, mice
      • BULK
        • printers, storage, network devices
      • ISOCHRONOUS
        • audio, video devices
struct usb_host_endpoint foo;
struct usb_endpoint_descriptor bar;
  • each USB driver controls an interface
    • e.g. Linux needs two different drivers for one set of speakers
struct usb_interface baz;
struct usb_host_config zap;
struct usb_device quax;
$ find /sys/devices/ -name *usb*
/sys/devices/platform/vhci_hcd.0/usbip_debug
/sys/devices/platform/vhci_hcd.0/usb1
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port3
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port1
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port8
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port6
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port4
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port2
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port7
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port5
/sys/devices/platform/vhci_hcd.0/usb2
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port7
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port7/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port5
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port5/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port3
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port3/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port1
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port1/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port8
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port8/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port6
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port6/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port4
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port4/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port2
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port2/usb3_lpm_permit
root_hub-hub_port:config.interface
$ lsusb
Bus 002 Device 001: ID 1d6b:0003 Linux Foundation 3.0 root hub
Bus 001 Device 001: ID 1d6b:0002 Linux Foundation 2.0 root hub
  • urb ... USB request block
    • https://github.com/torvalds/linux/blob/master/include/linux/usb.h
    • Created by a USB device driver.
    • Assigned to a specific endpoint of a specific USB device.
    • Submitted to the USB core, by the USB device driver.
    • Submitted to the specific USB host controller driver for the specified device by the USB core.
    • Processed by the USB host controller driver that makes a USB transfer to the device.
    • When the urb is completed, the USB host controller driver notifies the USB device driver.
  • do not create urb statically
    • always allocate urbs with the following function; a statically created urb would break the reference counting the USB core uses for urbs
struct urb *usb_alloc_urb(int iso_packets, int mem_flags);
void usb_free_urb(struct urb *urb);
void usb_fill_int_urb(
  struct urb *urb,
  struct usb_device *dev,
  unsigned int pipe,
  void *transfer_buffer,
  int buffer_length,
  usb_complete_t complete,
  void *context,
  int interval
);

void usb_fill_bulk_urb(
  struct urb *urb,
  struct usb_device *dev,
  unsigned int pipe,
  void *transfer_buffer,
  int buffer_length,
  usb_complete_t complete,
  void *context
);

void usb_fill_control_urb(
  struct urb *urb,
  struct usb_device *dev,
  unsigned int pipe,
  unsigned char *setup_packet,
  void *transfer_buffer,
  int buffer_length,
  usb_complete_t complete,
  void *context
);
// isochronous urbs are only initialized by hand like this:
urb->dev = dev;
urb->context = uvd;
urb->pipe = usb_rcvisocpipe(dev, uvd->video_endp-1);
urb->interval = 1;
urb->transfer_flags = URB_ISO_ASAP;
urb->transfer_buffer = cam->sts_buf[i];
urb->complete = konicawc_isoc_irq;
urb->number_of_packets = FRAMES_PER_DESC;
urb->transfer_buffer_length = FRAMES_PER_DESC;
for (j=0; j < FRAMES_PER_DESC; j++) {
  urb->iso_frame_desc[j].offset = j;
  urb->iso_frame_desc[j].length = 1;
}
  • submitting urbs
int usb_submit_urb(struct urb *urb, int mem_flags);
// GFP_ATOMIC, GFP_NOIO, GFP_KERNEL
  • cancelling urbs
int usb_kill_urb(struct urb *urb);
int usb_unlink_urb(struct urb *urb);
/* table of devices that work with this driver */
static struct usb_device_id skel_table[] = {
    { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
    {}  /* Terminating entry */
};

MODULE_DEVICE_TABLE(usb, skel_table);
static struct usb_device_id usb_ids[] = {
    {.driver_info = 42},
    {}  /* Terminating entry */
};
static struct usb_driver skel_driver = {
    .owner = THIS_MODULE,
    .name = "skeleton",
    .id_table = skel_table,
    .probe = skel_probe,
    .disconnect = skel_disconnect,
};
/*
 * Init routine (marked __init): hands skel_driver to usb_register().
 * A nonzero result is reported through err() and then returned to the
 * caller; zero is returned unchanged.
 */
static int __init usb_skel_init(void)
{
    /* register this driver with the USB subsystem */
    int rc = usb_register(&skel_driver);

    if (rc != 0) {
        err("usb_register failed. Error number %d", rc);
    }

    return rc;
}

/*
 * Exit routine (marked __exit): removes skel_driver again by calling
 * usb_deregister().
 */
static void __exit usb_skel_exit(void) {
    /* deregister this driver with the USB subsystem */
    usb_deregister(&skel_driver);
}
  • also, you can transfer data without urbs via some helper functions
    • usb_bulk_msg
    • usb_control_msg
    • usb_get_descriptor
    • usb_string, usb_get_string
    • etc...

Chapter 14

struct cdev {
    struct kobject kobj;
    struct module *owner;
    struct file_operations *ops;
    struct list_head list;
    dev_t dev;
    unsigned int count;
};

// back-casting
struct cdev *device = container_of(kp, struct cdev, kobj);

// initializing, you should do zeroing in advance
// side-effect: increments the reference count
void kobject_init(struct kobject *kobj);

int kobject_set_name(struct kobject *kobj, const char *format, ...);
// returns the address or NULL, increments the reference count
struct kobject *kobject_get(struct kobject *kobj);

// decrements the reference count
void kobject_put(struct kobject *kobj);
  • ktype
struct kobj_type {
    void (*release)(struct kobject *);
    struct sysfs_ops *sysfs_ops;
    struct attribute **default_attrs;
};

struct kobj_type *get_ktype(struct kobject *kobj);
int kobject_add(struct kobject *kobj);
void kobject_del(struct kobject *kobj);

// kobject_init and kobject_add
extern int kobject_register(struct kobject *kobj);

// kobject_del and kobject_put
void kobject_unregister(struct kobject *kobj);
void kset_init(struct kset *kset);
int kset_add(struct kset *kset);
int kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);
struct kset *kset_get(struct kset *kset);
void kset_put(struct kset *kset);
kobject_set_name(&my_set->kobj, "The name");
  • subsystems
    • block_subsys /sys/block
    • device_subsys /sys/devices
struct subsystem {
    struct kset kset;
    struct rw_semaphore rwsem;
};

decl_subsys(name, struct kobj_type *type, struct kset_hotplug_ops *hotplug_ops);

void subsystem_init(struct subsystem *subsys);
int subsystem_register(struct subsystem *subsys);
void subsystem_unregister(struct subsystem *subsys);
// presumably takes a reference on the subsystem, paired with subsys_put() (confirm against the kernel source)
struct subsystem *subsys_get(struct subsystem *subsys);
void subsys_put(struct subsystem *subsys);
struct attribute {
    char *name;
    struct module *owner;
    mode_t mode; // the last entry must be zero-filled
};
struct sysfs_ops {
    ssize_t (*show)(struct kobject *kobj, struct attribute *attr, char *buffer);
    ssize_t (*store)(struct kobject *kobj, struct attribute *attr, const char *buffer, size_t size);
};
int sysfs_create_file(struct kobject *kobj, struct attribute *attr);
int sysfs_remove_file(struct kobject *kobj, struct attribute *attr);
// binary attributes, e.g. for when the user wants to upload firmware to the device
struct bin_attribute {
    struct attribute attr;
    size_t size;
    ssize_t (*read)(struct kobject *kobj, char *buffer, loff_t pos, size_t size);
    ssize_t (*write)(struct kobject *kobj, char *buffer, loff_t pos, size_t size);
};

int sysfs_create_bin_file(struct kobject *kobj, struct bin_attribute *attr);
int sysfs_remove_bin_file(struct kobject *kobj, struct bin_attribute *attr);
// linking between drivers and devices
int sysfs_create_link(struct kobject *kobj, struct kobject *target, char *name);
void sysfs_remove_link(struct kobject *kobj, char *name);
  • hotplug event generation
    • a notification to user space from the kernel that something has changed in the system's configuration
    • e.g. loading drivers, creating device nodes, mounting partitions
    • /sbin/hotplug
struct kset_hotplug_ops {
    int (*filter)(struct kset *kset, struct kobject *kobj);
    char *(*name)(struct kset *kset, struct kobject *kobj);
    int (*hotplug)(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size);
};
/*
 * Hotplug filter: returns 1 when the ktype of kobj is ktype_block or
 * ktype_part, and 0 for any other ktype. The kset argument is not used.
 */
static int block_hotplug_filter(struct kset *kset, struct kobject *kobj)
{
    struct kobj_type *type = get_ktype(kobj);

    if (type == &ktype_block) {
        return 1;
    }

    return type == &ktype_part;
}
struct bus_type {
    char *name;
    struct subsystem subsys;
    struct kset drivers;
    struct kset devices;
    int (*match)(struct device *dev, struct device_driver *drv);
    struct device *(*add)(struct device * parent, char * bus_id);
    int (*hotplug) (struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
    /* Some fields omitted */
};

int bus_register(struct bus_type *bus);
void bus_unregister(struct bus_type *bus);
struct bus_attribute {
    struct attribute attr;
    ssize_t (*show)(struct bus_type *bus, char *buf);
    ssize_t (*store)(struct bus_type *bus, const char *buf, size_t count);
};

int bus_create_file(struct bus_type *bus, struct bus_attribute *attr);
void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr);
struct device {
    struct device *parent;
    struct kobject kobj;
    char bus_id[BUS_ID_SIZE];
    struct bus_type *bus;
    struct device_driver *driver;
    void *driver_data;
    void (*release)(struct device *dev);
    /* Several fields omitted */
};

int device_register(struct device *dev);
void device_unregister(struct device *dev);
struct device_attribute {
    struct attribute attr;
    ssize_t (*show)(struct device *dev, char *buf);
    ssize_t (*store)(struct device *dev, const char *buf, size_t count);
};

DEVICE_ATTR(name, mode, show, store);

int device_create_file(struct device *device, struct device_attribute *entry);
void device_remove_file(struct device *dev, struct device_attribute *attr);
struct device_driver {
    char *name;
    struct bus_type *bus;
    struct kobject kobj;
    struct list_head devices;
    int (*probe)(struct device *dev);
    int (*remove)(struct device *dev);
    void (*shutdown) (struct device *dev);
};

int driver_register(struct device_driver *drv);
void driver_unregister(struct device_driver *drv);
struct driver_attribute {
    struct attribute attr;
    ssize_t (*show)(struct device_driver *drv, char *buf);
    ssize_t (*store)(struct device_driver *drv, const char *buf, size_t count);
};

DRIVER_ATTR(name, mode, show, store);

int driver_create_file(struct device_driver *drv, struct driver_attribute *attr);
void driver_remove_file(struct device_driver *drv, struct driver_attribute *attr);
  • classes
    • e.g. disks
      • SCSI disk
      • ATA disk
    • /sys/class
    • interfaces
      • class_simple
      • regular class
struct class_simple *class_simple_create(struct module *owner, char *name);
void class_simple_destroy(struct class_simple *cs);

struct class_device *class_simple_device_add(
    struct class_simple *cs,
    dev_t devnum,
    struct device *device,
    const char *fmt,
    ...
);

int class_simple_set_hotplug(
    struct class_simple *cs, 
    int (*hotplug)(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
);

void class_simple_device_remove(dev_t dev);
  • the full class interface
struct class {
    char *name;
    struct class_attribute *class_attrs;
    struct class_device_attribute *class_dev_attrs;
    int (*hotplug)(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
    void (*release)(struct class_device *dev);
    void (*class_release)(struct class *class);
    /* Some fields omitted */
};

int class_register(struct class *cls);
void class_unregister(struct class *cls);
struct class_attribute {
    struct attribute attr;
    ssize_t (*show)(struct class *cls, char *buf);
    ssize_t (*store)(struct class *cls, const char *buf, size_t count);
};

CLASS_ATTR(name, mode, show, store);

int class_create_file(struct class *cls, const struct class_attribute *attr);
void class_remove_file(struct class *cls, const struct class_attribute *attr);
  • class devices
struct class_device {
    struct kobject kobj;
    struct class *class;
    struct device *dev;
    void *class_data;
    char class_id[BUS_ID_SIZE];
};

int class_device_register(struct class_device *cd);
void class_device_unregister(struct class_device *cd);

int class_device_rename(struct class_device *cd, char *new_name);
struct class_device_attribute {
   struct attribute attr;
   ssize_t (*show)(struct class_device *cls, char *buf);
   ssize_t (*store)(struct class_device *cls, const char *buf, size_t count);
};

CLASS_DEVICE_ATTR(name, mode, show, store);

int class_device_create_file(struct class_device *cls, const struct class_device_attribute *attr);
void class_device_remove_file(struct class_device *cls, const struct class_device_attribute *attr);
  • class interfaces
struct class_interface {
    struct class *class;
    int (*add) (struct class_device *cd);
    void (*remove) (struct class_device *cd);
};

int class_interface_register(struct class_interface *intf);
void class_interface_unregister(struct class_interface *intf);
  • add a device
    • pci_bus_type variable is registered with the driver core when the PCI subsystem is loaded in the kernel with a call to bus_register
    • the driver core creates a sysfs directory in /sys/bus/pci that consists of two directories: devices and drivers
    • all PCI drivers must define a struct pci_driver variable
      • the structure contains a struct device_driver initialized by the PCI core when the PCI driver is registered
    • driver_register
    • device_register
struct bus_type pci_bus_type = {
    .name      = "pci",
    .match     = pci_bus_match,
    .hotplug   = pci_hotplug,
    .suspend   = pci_device_suspend,
    .resume    = pci_device_resume,
    .dev_attrs = pci_dev_attrs,
};
struct pci_dev {
    /* ... */
    unsigned int   devfn;
    unsigned short vendor;
    unsigned short device;
    unsigned short subsystem_vendor;
    unsigned short subsystem_device;
    unsigned int   class;
    /* ... */
    struct pci_driver *driver;
    /* ... */
    struct device dev;
    /* ... */
};
$ tree -d /sys/bus/pci
/sys/bus/pci
├── devices
│   ├── 18b5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/46d9ad56-18b5-4d77-b62f-0b5891160bb1/pci18b5:00/18b5:00:00.0
│   ├── 1fe5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/502c91c3-1fe5-4692-9534-b0c68e0b8170/pci1fe5:00/1fe5:00:00.0
│   ├── 538d:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/400930b1-538d-4d70-9538-26c4b4e104a7/pci538d:00/538d:00:00.0
│   ├── aec5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/0399541e-aec5-4829-b1f1-6154168c0f99/pciaec5:00/aec5:00:00.0
│   ├── c2a1:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476/pcic2a1:00/c2a1:00:00.0
│   └── c546:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/9fb76897-c546-420b-9873-997bc89c414d/pcic546:00/c546:00:00.0
├── drivers
│   ├── dxgkrnl
│   │   ├── 1fe5:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/502c91c3-1fe5-4692-9534-b0c68e0b8170/pci1fe5:00/1fe5:00:00.0
│   │   └── module -> ../../../../module/dxgkrnl
│   ├── pcieport
│   ├── serial
│   ├── vfio-pci
│   │   └── module -> ../../../../module/vfio_pci
│   └── virtio-pci
│       ├── 18b5:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/46d9ad56-18b5-4d77-b62f-0b5891160bb1/pci18b5:00/18b5:00:00.0
│       ├── 538d:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/400930b1-538d-4d70-9538-26c4b4e104a7/pci538d:00/538d:00:00.0
│       ├── aec5:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/0399541e-aec5-4829-b1f1-6154168c0f99/pciaec5:00/aec5:00:00.0
│       ├── c2a1:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476/pcic2a1:00/c2a1:00:00.0
│       ├── c546:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/9fb76897-c546-420b-9873-997bc89c414d/pcic546:00/c546:00:00.0
│       └── module -> ../../../../module/virtio_pci
└── slots
    ├── 1074344113
    ├── 1188670806
    ├── 1345098179
    ├── 2415185341
    ├── 2679597207
    └── 60380190

29 directories
  • remove a device
    • fakephp.c
    • pci_remove_bus_device
      • device_unregister
        • kobject_del
          • release
          • pci_release_dev
  • add a driver
    • pci_register_driver
      • struct device_driver
        • struct pci_driver
      • driver_register
        • bus_add_driver
  • remove a driver
    • pci_unregister_driver
      • driver_unregister
        • release
down(&drv->unload_sem);
up(&drv->unload_sem);
result = readl(ptr);
if (result == ~(u32)0) {
    return -ENODEV;  /* card removed */
}
  • /sbin/hotplug
    • hotplug(8)
# Directory that holds the hotplug handler scripts.
DIR="/etc/hotplug.d"
# Visit every *.hotplug entry in the subdirectory named by the first
# argument, then every *.hotplug entry in default/.
for I in "${DIR}/$1/"*.hotplug "${DIR}/"default/*.hotplug ; do
    # Only regular files are considered; this also skips a pattern that
    # matched nothing and was left unexpanded.
    if [ -f $I ]; then
        # Run the handler only if it is executable, passing the first argument on.
        test -x $I && $I $1 ;
    fi
done
# The script always ends with exit status 1.
exit 1
struct kset_hotplug_ops
  • environment variables
    • IEEE1394 (FireWire)
      • VENDOR_ID
      • MODEL_ID
      • GUID
      • SPECIFIER_ID
      • VERSION
    • Networking
      • INTERFACE
        • lo, eth0
    • PCI
      • PCI_CLASS
        • hex
      • PCI_ID
        • vendor:device hex
      • PCI_SUBSYSTEM_ID
        • subsys_vendor:subsys_device
      • PCI_SLOT_NAME
        • domain:bus:slot:function
          • 0000:00:0d.0
    • Input (mice, keyboards, joysticks, etc.)
      • PRODUCT
        • bustype:vendor:product:version hex
      • if the device supports:
        • NAME
        • PHYS
        • EV
        • KEY
        • REL
        • ABS
        • MSC
        • LED
        • SND
        • FF
    • USB
      • PRODUCT
        • idVendor/idProduct/bcdDevice
      • TYPE
        • bDeviceClass/bDeviceSubClass/bDeviceProtocol
      • if the bDeviceClass field is set to 0:
        • INTERFACE
          • bInterfaceClass/bInterfaceSubClass/bInterfaceProtocol
      • if the kernel build option, CONFIG_USB_DEVICEFS
        • DEVICE
          • /proc/bus/usb/USB_BUS_NUMBER/USB_DEVICE_NUMBER
    • SCSI
    • Laptop docking stations
    • S/390 and zSeries
  • Linux hotplug scripts
    • MODULE_DEVICE_TABLE
    • /lib/modules/KERNEL_VERSION/modules.*map
  • udev
    • class_simple_create
    • class_simple_device_add
    • class_simple_device_remove
    • class_simple_destroy
  • dealing with firmware
#include <linux/firmware.h>
int request_firmware(const struct firmware **fw, char *name, struct device *device);

struct firmware {
    size_t size;
    u8 *data;
};

void release_firmware(struct firmware *fw);

int request_firmware_nowait(
    struct module *module, 
    char *name,
    struct device *device,
    void *context,
    void (*cont)(const struct firmware *fw, void *context)
);

Chapter 15

struct page *virt_to_page(void *kaddr);
struct page *pfn_to_page(int pfn); // page frame number
void *page_address(struct page *page); // use kmap instead
void *kmap(struct page *page);
void kunmap(struct page *page);
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *addr, enum km_type type);
  • page tables
    • the processor must have a mechanism for translating virtual addresses into their corresponding physical addresses
      • on any modern system
      • that is called page table
      • a multilevel tree-structured array
  • virtual memory areas (VMA)
    • text ... the program's executable code
    • multiple areas for data
      • initialized data
      • uninitialized data
        • BSS ... block started by symbol
      • the program stack
      • one area for each active memory mapping
    • /proc/<pid>/maps
    • /proc/self
    • start-end perm offset major:minor inode image
      • perm's p means private
      • confusingly, for device mappings, the major and minor numbers refer to the disk partition holding the device special file that was opened by the user, and not the device itself
    • struct vm_area_struct
  • to support mmap for user
  • the process memory map
$ cat /proc/self/maps
55ecc9c26000-55ecc9c28000 r--p 00000000 08:20 14816                      /usr/bin/cat
55ecc9c28000-55ecc9c2c000 r-xp 00002000 08:20 14816                      /usr/bin/cat
55ecc9c2c000-55ecc9c2e000 r--p 00006000 08:20 14816                      /usr/bin/cat
55ecc9c2e000-55ecc9c2f000 r--p 00007000 08:20 14816                      /usr/bin/cat
55ecc9c2f000-55ecc9c30000 rw-p 00008000 08:20 14816                      /usr/bin/cat
55ecca6ba000-55ecca6db000 rw-p 00000000 00:00 0                          [heap]
7fdd4762f000-7fdd47651000 rw-p 00000000 00:00 0
7fdd47651000-7fdd476a8000 r--p 00000000 08:20 42519                      /usr/lib/locale/C.utf8/LC_CTYPE
7fdd476a8000-7fdd476a9000 r--p 00000000 08:20 42552                      /usr/lib/locale/C.utf8/LC_NUMERIC
7fdd476a9000-7fdd476aa000 r--p 00000000 08:20 42569                      /usr/lib/locale/C.utf8/LC_TIME
7fdd476aa000-7fdd476ab000 r--p 00000000 08:20 40718                      /usr/lib/locale/C.utf8/LC_COLLATE
7fdd476ab000-7fdd476ac000 r--p 00000000 08:20 42536                      /usr/lib/locale/C.utf8/LC_MONETARY
7fdd476ac000-7fdd476ad000 r--p 00000000 08:20 42533                      /usr/lib/locale/C.utf8/LC_MESSAGES/SYS_LC_MESSAGES
7fdd476ad000-7fdd476ae000 r--p 00000000 08:20 42555                      /usr/lib/locale/C.utf8/LC_PAPER
7fdd476ae000-7fdd476af000 r--p 00000000 08:20 42540                      /usr/lib/locale/C.utf8/LC_NAME
7fdd476af000-7fdd476b0000 r--p 00000000 08:20 40486                      /usr/lib/locale/C.utf8/LC_ADDRESS
7fdd476b0000-7fdd47999000 r--p 00000000 08:20 13525                      /usr/lib/locale/locale-archive
7fdd47999000-7fdd4799c000 rw-p 00000000 00:00 0
7fdd4799c000-7fdd479c4000 r--p 00000000 08:20 39598                      /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd479c4000-7fdd47b59000 r-xp 00028000 08:20 39598                      /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47b59000-7fdd47bb1000 r--p 001bd000 08:20 39598                      /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47bb1000-7fdd47bb5000 r--p 00214000 08:20 39598                      /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47bb5000-7fdd47bb7000 rw-p 00218000 08:20 39598                      /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47bb7000-7fdd47bc4000 rw-p 00000000 00:00 0
7fdd47bc4000-7fdd47bc5000 r--p 00000000 08:20 42563                      /usr/lib/locale/C.utf8/LC_TELEPHONE
7fdd47bc5000-7fdd47bc6000 r--p 00000000 08:20 42522                      /usr/lib/locale/C.utf8/LC_MEASUREMENT
7fdd47bc6000-7fdd47bcd000 r--s 00000000 08:20 42977                      /usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache
7fdd47bcd000-7fdd47bcf000 rw-p 00000000 00:00 0
7fdd47bcf000-7fdd47bd1000 r--p 00000000 08:20 10010                      /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47bd1000-7fdd47bfb000 r-xp 00002000 08:20 10010                      /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47bfb000-7fdd47c06000 r--p 0002c000 08:20 10010                      /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47c06000-7fdd47c07000 r--p 00000000 08:20 42521                      /usr/lib/locale/C.utf8/LC_IDENTIFICATION
7fdd47c07000-7fdd47c09000 r--p 00037000 08:20 10010                      /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47c09000-7fdd47c0b000 rw-p 00039000 08:20 10010                      /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7ffe673cf000-7ffe673f0000 rw-p 00000000 00:00 0                          [stack]
7ffe673f0000-7ffe673f4000 r--p 00000000 00:00 0                          [vvar]
7ffe673f4000-7ffe673f6000 r-xp 00000000 00:00 0                          [vdso]
$ cat /proc/iomem
00000000-00000000 : Reserved
00000000-00000000 : System RAM
00000000-00000000 : Reserved
00000000-00000000 : System ROM
00000000-00000000 : ACPI Tables
00000000-00000000 : System RAM
  00000000-00000000 : Kernel code
  00000000-00000000 : Kernel rodata
  00000000-00000000 : Kernel data
  00000000-00000000 : Kernel bss
00000000-00000000 : 8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476
00000000-00000000 : 502c91c3-1fe5-4692-9534-b0c68e0b8170
00000000-00000000 : 0399541e-aec5-4829-b1f1-6154168c0f99
00000000-00000000 : 46d9ad56-18b5-4d77-b62f-0b5891160bb1
00000000-00000000 : 400930b1-538d-4d70-9538-26c4b4e104a7
00000000-00000000 : 9fb76897-c546-420b-9873-997bc89c414d
00000000-00000000 : PNP0003:00
00000000-00000000 : Local APIC
  00000000-00000000 : PNP0003:00
00000000-00000000 : System RAM
00000000-00000000 : 8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476
  00000000-00000000 : c2a1:00:00.0
    00000000-00000000 : virtio-pci-modern
  00000000-00000000 : c2a1:00:00.0
  00000000-00000000 : c2a1:00:00.0
    00000000-00000000 : virtio-pci-modern
00000000-00000000 : 0399541e-aec5-4829-b1f1-6154168c0f99
  00000000-00000000 : aec5:00:00.0
    00000000-00000000 : virtio-pci-modern
  00000000-00000000 : aec5:00:00.0
  00000000-00000000 : aec5:00:00.0
    00000000-00000000 : virtio-pci-modern
00000000-00000000 : 400930b1-538d-4d70-9538-26c4b4e104a7
  00000000-00000000 : 538d:00:00.0
    00000000-00000000 : virtio-pci-modern
  00000000-00000000 : 538d:00:00.0
  00000000-00000000 : 538d:00:00.0
    00000000-00000000 : virtio-pci-modern
00000000-00000000 : 9fb76897-c546-420b-9873-997bc89c414d
  00000000-00000000 : c546:00:00.0
    00000000-00000000 : virtio-pci-modern
  00000000-00000000 : c546:00:00.0
  00000000-00000000 : c546:00:00.0
    00000000-00000000 : virtio-pci-modern
00000000-00000000 : 711dad3a-73ce-468b-90a9-ede6906841b2
00000000-00000000 : 46d9ad56-18b5-4d77-b62f-0b5891160bb1
  00000000-00000000 : 18b5:00:00.0
    00000000-00000000 : virtio2
  00000000-00000000 : 18b5:00:00.0
    00000000-00000000 : virtio-pci-modern
  00000000-00000000 : 18b5:00:00.0
  • the mmap device operation
mmap (caddr_t addr, size_t len, int prot, int flags, int fd, off_t offset)
int (*mmap) (struct file *filp, struct vm_area_struct *vma);
  • vma->vm_ops
// refers to actual system RAM
int remap_pfn_range(
  struct vm_area_struct *vma, 
  unsigned long virt_addr,
  unsigned long pfn, // page frame number, vma->vm_pgoff
  unsigned long size,
  pgprot_t prot // protection, vma->vm_page_prot
);

// phys_addr points to I/O memory
int io_remap_page_range(
  struct vm_area_struct *vma, 
  unsigned long virt_addr,
  unsigned long phys_addr,
  unsigned long size,
  pgprot_t prot
);
  • references to device memory should not be cached by the processor
    • often the system BIOS sets things up properly, but
    • pgprot_noncached
  • simple
    • Simple Implementation Mapping Pages with Little Enthusiasm
/*
 * mmap method: map the whole range covered by the VMA in one call to
 * remap_pfn_range, then install the VMA operations.
 *
 * Returns 0 on success, -EAGAIN if remap_pfn_range reports a failure.
 */
static int simple_remap_mmap(struct file *filp, struct vm_area_struct *vma) {
    unsigned long size = vma->vm_end - vma->vm_start;

    /* vm_pgoff is passed as the page frame number to map. */
    if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot)) {
        return -EAGAIN;
    }

    vma->vm_ops = &simple_remap_vm_ops;
    /* The open hook is invoked explicitly here for this first mapping. */
    simple_vma_open(vma);
    return 0;
}
void simple_vma_open(struct vm_area_struct *vma) {
    printk(KERN_NOTICE "Simple VMA open, virt %lx, phys %lx\n", vma->vm_start, vma->vm_pgoff << PAGE_SHIFT);
}

/* VMA close hook: only logs a notice; the vma argument is not used. */
void simple_vma_close(struct vm_area_struct *vma) {
    printk(KERN_NOTICE "Simple VMA close.\n");
}

/*
 * VMA operations that simple_remap_mmap installs in vma->vm_ops:
 * open and close hooks that only log a message.
 */
static struct vm_operations_struct simple_remap_vm_ops = {
    .open =  simple_vma_open,
    .close = simple_vma_close,
};

vma->vm_ops = &simple_remap_vm_ops;
simple_vma_open(vma);
  • implement nopage
    • to support mremap
    • you must use remap_pfn_range for the PCI bus
      • nopage does not work with PCI memory areas, extension of PCI mappings is not possible
struct page *(*nopage)(struct vm_area_struct *vma, unsigned long address, int *type);

get_page(struct page *pageptr);
unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
unsigned long physical = simple_region_start + off;
unsigned long vsize = vma->vm_end - vma->vm_start;
unsigned long psize = simple_region_size - off;

if (vsize > psize) {
    return -EINVAL; /* spans too high */
}

remap_pfn_range(vma, vma_>vm_start, physical, vsize, vma->vm_page_prot);
// nopage handler that always fails: returning NOPAGE_SIGBUS keeps the zero
// page from being mapped when an access goes beyond the mapped region.
// The vma, address and type arguments are not used.
struct page *simple_nopage(struct vm_area_struct *vma, unsigned long address, int *type) {
    return NOPAGE_SIGBUS; /* send a SIGBUS */
}
#include <linux/mm.h>
int get_user_pages(
  struct task_struct *tsk, 
  struct mm_struct *mm, 
  unsigned long start,
  int len, 
  int write, 
  int force, 
  struct page **pages, 
  struct vm_area_struct **vmas
);

down_read(&current->mm->mmap_sem);
result = get_user_pages(current, current->mm, ...);
up_read(&current->mm->mmap_sem);
#include <linux/page-flags.h>
void SetPageDirty(struct page *page);
if (!PageReserved(page)) {
  SetPageDirty(page);
}

void page_cache_release(struct page *page);
ssize_t (*aio_read) (struct kiocb *iocb, char *buffer, size_t count, loff_t offset);
ssize_t (*aio_write) (struct kiocb *iocb, const char *buffer, size_t count, loff_t offset);
int (*aio_fsync) (struct kiocb *iocb, int datasync);

// the kernel occasionally creates synchronous IOCBs, so your driver should query for them:
int is_sync_kiocb(struct kiocb *iocb);
// if this function returns a nonzero value, your driver must execute the operation synchronously

int aio_complete(struct kiocb *iocb, long res, long res2);
// if you need a huge buffer, but it is discouraged
dmabuf = ioremap(0xFF00000 /* 255M */, 0x100000 /* 1M */);

// the use of these functions is strongly discouraged, use the generic DMA layer instead
unsigned long virt_to_bus(volatile void *address);
void *bus_to_virt(unsigned long address);
// if your device does not support 32-bit DMA operations
int dma_set_mask(struct device *dev, u64 mask);

/*
 * dma_set_mask returns 0 when the device can do DMA with the given mask
 * (here 24 bits) and a nonzero error code otherwise.
 */
if (dma_set_mask(dev, 0xffffff) == 0) {
    card->use_dma = 1;
} else {
    card->use_dma = 0;   /* We'll have to live without DMA */
    printk(KERN_WARNING "mydev: DMA not supported\n");
}
  • virt_to_bus is not suitable, IOMMU
  • a bounce buffer
  • cache coherency: cached data must be flushed out first
  • dma_addr_t
  • streaming DMA mappings, it is recommended by the kernel developers, over coherent DMA mappings whenever possible
void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, int flag);
void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle);
/*
 * Create a DMA pool.
 *   name       - name for the pool
 *   dev        - device the pool is created for
 *   size       - size parameter of the pool
 *   align      - alignment parameter of the pool
 *   allocation - allocation parameter of the pool
 * Returns a pointer to the new struct dma_pool.
 */
struct dma_pool *dma_pool_create(
  const char *name,
  struct device *dev,
  size_t size,
  size_t align,
  size_t allocation
);

void *dma_pool_alloc(struct dma_pool *pool, int mem_flags, dma_addr_t *handle);
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);

void dma_pool_destroy(struct dma_pool *pool);
  • streaming DMA buffer mappings
dma_addr_t dma_map_single(struct device *dev, void *buffer, size_t size, enum dma_data_direction direction);
void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction);

void dma_sync_single_for_cpu(struct device *dev, dma_handle_t bus_addr, size_t size, enum dma_data_direction direction);
void dma_sync_single_for_device(struct device *dev, dma_handle_t bus_addr, size_t size, enum dma_data_direction direction);

// single-page streaming mappings
dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction);
void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, enum dma_data_direction direction);
int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);
void dma_unmap_sg(struct device *dev, struct scatterlist *list, int nents, enum dma_data_direction direction);

dma_addr_t sg_dma_address(struct scatterlist *sg);
unsigned int sg_dma_len(struct scatterlist *sg);

void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);
void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);
$ cat /proc/dma
 4: cascade
int request_dma(unsigned int channel, const char *name); 
void free_dma(unsigned int channel);

unsigned long claim_dma_lock( );
void release_dma_lock(unsigned long flags);

void set_dma_mode(unsigned int channel, char mode);
void set_dma_addr(unsigned int channel, unsigned int addr);
void set_dma_count(unsigned int channel, unsigned int count);

void disable_dma(unsigned int channel);
void enable_dma(unsigned int channel);
int get_dma_residue(unsigned int channel);
void clear_dma_ff(unsigned int channel);
unsigned long flags;

flags = claim_dma_lock();
disable_dma(channel);
clear_dma_ff(channel);
set_dma_mode(channel, mode);
set_dma_addr(channel, virt_to_bus(buf));
set_dma_count(channel, count);
enable_dma(channel);
release_dma_lock(flags);
int residue;
unsigned long flags = claim_dma_lock ();
residue = get_dma_residue(channel);
release_dma_lock(flags);
// residue == 0

Chapter 16

int register_blkdev(unsigned int major, const char *name);
int unregister_blkdev(unsigned int major, const char *name);

struct gendisk *alloc_disk(int minors);
void del_gendisk(struct gendisk *gd);
void add_disk(struct gendisk *gd);
$ cat /proc/partitions
major minor  #blocks  name

   1        0      65536 ram0
   1        1      65536 ram1
   1        2      65536 ram2
   1        3      65536 ram3
   1        4      65536 ram4
   1        5      65536 ram5
   1        6      65536 ram6
   1        7      65536 ram7
   1        8      65536 ram8
   1        9      65536 ram9
   1       10      65536 ram10
   1       11      65536 ram11
   1       12      65536 ram12
   1       13      65536 ram13
   1       14      65536 ram14
   1       15      65536 ram15
   8        0     372040 sda
   8       16    1048580 sdb
   8       32  268435456 sdc
/*
 * Open method: looks up the sbull device behind the inode, stores it in
 * filp->private_data and bumps the user count under dev->lock.
 * Always returns 0.
 */
static int sbull_open(struct inode *inode, struct file *filp) {
    struct sbull_dev *dev = inode->i_bdev->bd_disk->private_data;
    /* Stop the timer that sbull_release arms when the user count hits zero. */
    del_timer_sync(&dev->timer);
    filp->private_data = dev;
    spin_lock(&dev->lock);
    /* No users yet: call check_disk_change on the block device first. */
    if (! dev->users) {
        check_disk_change(inode->i_bdev);
    }
    dev->users++;
    spin_unlock(&dev->lock);
    return 0;
}

/*
 * Release method: drops the user count under dev->lock and, when it
 * reaches zero, arms dev->timer to fire INVALIDATE_DELAY jiffies from now.
 * Always returns 0.
 */
static int sbull_release(struct inode *inode, struct file *filp) {
    struct sbull_dev *dev = inode->i_bdev->bd_disk->private_data;
    spin_lock(&dev->lock);
    dev->users--;
    /* Last user gone: schedule the timer (sbull_open cancels it again). */
    if (!dev->users) {
        dev->timer.expires = jiffies + INVALIDATE_DELAY;
        add_timer(&dev->timer);
    }
    spin_unlock(&dev->lock);
    return 0;
}

int sbull_media_changed(struct gendisk *gd) {
    struct sbull_dev *dev = gd->private_data;
    return dev->media_change;
}

/*
 * If the media_change flag is set, clear it and zero the device's data
 * buffer (dev->size bytes). Always returns 0.
 */
int sbull_revalidate(struct gendisk *gd) {
    struct sbull_dev *sbull = gd->private_data;

    /* Nothing to do when no change is pending. */
    if (!sbull->media_change) {
        return 0;
    }

    sbull->media_change = 0;
    memset(sbull->data, 0, sbull->size);
    return 0;
}

int sbull_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) {
    long size;
    struct hd_geometry geo;
    struct sbull_dev *dev = filp->private_data;
    switch(cmd) {
        case HDIO_GETGEO:
        /*
         * Get geometry: since we are a virtual device, we have to make
         * up something plausible.  So we claim 16 sectors, four heads,
         * and calculate the corresponding number of cylinders.  We set the
         * start of data at sector four.
         */
        size = dev->size*(hardsect_size/KERNEL_SECTOR_SIZE);
        geo.cylinders = (size & ~0x3f) >> 6;
        geo.heads = 4;
        geo.sectors = 16;
        geo.start = 4;
        if (copy_to_user((void _ _user *) arg, &geo, sizeof(geo))) {
            return -EFAULT;
        }
        return 0;
    }
    return -ENOTTY; /* unknown command */
}
  • request processing
    • an example for the simple way
    • real-world block drivers devote much of their effort to performance optimization
void request(request_queue_t *queue);

dev->queue = blk_init_queue(sbull_request, &dev->lock);
void end_request(struct request *req, int succeeded);

/*
 * Request function: drains the queue one request at a time.
 * Filesystem requests are handed to sbull_transfer and completed with
 * success; anything else is logged and completed with failure.
 */
static void sbull_request(request_queue_t *q)
{
    struct request *rq;

    for (rq = elv_next_request(q); rq != NULL; rq = elv_next_request(q)) {
        struct sbull_dev *sbull = rq->rq_disk->private_data;

        if (blk_fs_request(rq)) {
            sbull_transfer(sbull, rq->sector, rq->current_nr_sectors,
                           rq->buffer, rq_data_dir(rq));
            end_request(rq, 1);
        } else {
            printk (KERN_NOTICE "Skip non-fs request\n");
            end_request(rq, 0);
        }
    }
}
/*
 * Copy nsect sectors between the device's data buffer and buffer.
 *   dev    - device whose data buffer is read or written
 *   sector - first sector, in KERNEL_SECTOR_SIZE units
 *   nsect  - number of sectors to copy
 *   buffer - caller's buffer
 *   write  - nonzero copies buffer into the device, zero copies out of it
 *
 * A transfer that would run past dev->size is logged and dropped.
 */
static void sbull_transfer(struct sbull_dev *dev, unsigned long sector, unsigned long nsect, char *buffer, int write) {
    unsigned long offset = sector*KERNEL_SECTOR_SIZE;
    unsigned long nbytes = nsect*KERNEL_SECTOR_SIZE;

    /*
     * Same test as (offset + nbytes) > dev->size, written so the sum cannot
     * wrap around. Assumes dev->size is a non-negative byte count.
     */
    if (offset > dev->size || nbytes > dev->size - offset) {
        /* both values are unsigned long, so print them with %lu */
        printk (KERN_NOTICE "Beyond-end write (%lu %lu)\n", offset, nbytes);
        return;
    }

    if (write) {
        memcpy(dev->data + offset, buffer, nbytes);
    } else {
        memcpy(buffer, dev->data + offset, nbytes);
    }
}
request_queue_t *blk_init_queue(request_fn_proc *request, spinlock_t *lock);
void blk_cleanup_queue(request_queue_t *);

struct request *elv_next_request(request_queue_t *queue);
void blkdev_dequeue_request(struct request *req);
void elv_requeue_request(request_queue_t *queue, struct request *req);
int end_that_request_first(struct request *req, int success, int count);
void end_that_request_last(struct request *req);
  • doing without a request queue
    • "no queue" mode
    • for flash memory, RAM disks, software RAID arrays, etc...
// The make_request should return 0, regardless of whether the I/O is successful.
typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);

void bio_endio(struct bio *bio, unsigned int bytes, int error);

request_queue_t *blk_alloc_queue(int flags);
void blk_queue_make_request(request_queue_t *queue, make_request_fn *func);
dev->queue = blk_alloc_queue(GFP_KERNEL);
if (dev->queue == NULL) {
    goto out_vfree;
}
blk_queue_make_request(dev->queue, sbull_make_request);
  • TCQ ... tagged command queueing

Chapter 17

$ cat /etc/networks
snullnet0       192.168.0.0
snullnet1       192.168.1.0

$ cat /etc/hosts
192.168.0.1   local0
192.168.0.2   remote0
192.168.1.2   local1
192.168.1.1   remote1

$ ifconfig sn0 local0
$ ifconfig sn1 local1
// size of private data, name is printf style
struct net_device *alloc_netdev(int sizeof_priv,  const char *name, void (*setup)(struct net_device *));
struct net_device *alloc_etherdev(int sizeof_priv);

struct net_device *snull_devs[2];
snull_devs[0] = alloc_netdev(sizeof(struct snull_priv), "sn%d", snull_init);
snull_devs[1] = alloc_netdev(sizeof(struct snull_priv), "sn%d", snull_init);
if (snull_devs[0] == NULL || snull_devs[1] == NULL) {
    goto out;
}
for (i = 0; i < 2;  i++) {
    if ((result = register_netdev(snull_devs[i]))) {
        printk("snull: error %i registering device \"%s\"\n", result, snull_devs[i]->name);
    }
}

// direct access is discouraged
struct snull_priv *priv = netdev_priv(dev);

for (int i = 0; i < 2;  i++) {
    if (snull_devs[i]) {
        unregister_netdev(snull_devs[i]);
        snull_teardown_pool(snull_devs[i]);
        free_netdev(snull_devs[i]);
    }
}
  • ARP ... address resolution protocol
    • IFF_NOARP
    • MAC ... Ethernet medium access control addresses
  • UTP ... unshielded twisted pair
  • MTU ... maximum transfer unit
  • SIOCSIFADDR ... Socket I/O Control Set Interface Address
  • SIOCSIFFLAGS ... Socket I/O Control Set Interface Flags
void netif_start_queue(struct net_device *dev);
void netif_stop_queue(struct net_device *dev);
void netif_wake_queue(struct net_device *dev);
void netif_tx_disable(struct net_device *dev);
if (skb_shinfo(skb)->nr_frags == 0) {
    /* Just use skb->data and skb->len as usual */
}

struct skb_frag_struct {
    struct page *page;
    __u16 page_offset;
    __u16 size;
};
  • packet reception
/*
 * Receive one packet: copy it into a freshly allocated skb, fill in the
 * metadata, update the interface statistics and hand it to netif_rx.
 * If no skb can be allocated the packet is counted as dropped.
 */
void snull_rx(struct net_device *dev, struct snull_packet *pkt) {
    struct sk_buff *skb;
    struct snull_priv *priv = netdev_priv(dev);

    /*
     * The packet has been retrieved from the transmission
     * medium. Build an skb around it, so upper layers can handle it
     */
    skb = dev_alloc_skb(pkt->datalen + 2);
    if (!skb) {
        /* rate-limit the message so a memory shortage does not flood the log */
        if (printk_ratelimit()) {
          printk(KERN_NOTICE "snull rx: low on mem - packet dropped\n");
        }
        priv->stats.rx_dropped++;
        return;
    }
    /*
     * Put the 2 extra bytes allocated above in front of the data, so the
     * IP header that follows the 14-byte Ethernet header is 16-byte aligned.
     */
    skb_reserve(skb, 2);
    memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);

    /* Write metadata, and then pass to the receive level */
    skb->dev = dev;
    skb->protocol = eth_type_trans(skb, dev);
    skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */
    priv->stats.rx_packets++;
    priv->stats.rx_bytes += pkt->datalen;
    netif_rx(skb); /* the return value is discarded */
}

Chapter 18

  • TTY ... teletypewriter
  • in the kernel
    • TTY core
    • TTY line discipline
    • TTY driver
  • types
    • console
    • serial port
    • pty
$ cat /proc/tty/drivers
/dev/tty             /dev/tty        5       0 system:/dev/tty
/dev/console         /dev/console    5       1 system:console
/dev/ptmx            /dev/ptmx       5       2 system
/dev/vc/0            /dev/vc/0       4       0 system:vtmaster
usbserial            /dev/ttyUSB   188 0-511 serial
acm                  /dev/ttyACM   166 0-255 serial
hvc                  /dev/hvc      229 0-7 system
serial               /dev/ttyS       4 64-95 serial
pty_slave            /dev/pts      136 0-1048575 pty:slave
pty_master           /dev/ptm      128 0-1048575 pty:master
unknown              /dev/tty        4 1-63 console
$ ls -la /sys/class/tty/ | head
total 0
drwxr-xr-x  2 root root 0 May 18 13:49 .
drwxr-xr-x 34 root root 0 May 18 13:49 ..
lrwxrwxrwx  1 root root 0 May 18 13:49 console -> ../../devices/virtual/tty/console
lrwxrwxrwx  1 root root 0 May 18 13:49 hvc0 -> ../../devices/virtual/tty/hvc0
lrwxrwxrwx  1 root root 0 May 18 13:49 hvc1 -> ../../devices/virtual/tty/hvc1
lrwxrwxrwx  1 root root 0 May 18 13:49 hvc2 -> ../../devices/virtual/tty/hvc2
lrwxrwxrwx  1 root root 0 May 18 13:49 hvc3 -> ../../devices/virtual/tty/hvc3
lrwxrwxrwx  1 root root 0 May 18 13:49 hvc4 -> ../../devices/virtual/tty/hvc4
lrwxrwxrwx  1 root root 0 May 18 13:49 hvc5 -> ../../devices/virtual/tty/hvc5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment