$ sudo apt install linux-source
$ cp /usr/src/linux-source-5.15.0/linux-source-5.15.0.tar.bz2 /path/to/where
$ tar jxf linux-source-5.15.0.tar.bz2
ERROR: Kernel configuration is invalid.
include/generated/autoconf.h or include/config/auto.conf are missing.
Run 'make oldconfig && make prepare' on kernel src to fix it.
$ sudo apt install libelf-dev
$ make olddefconfig
$ make prepare
$ make modules_prepare
https://stackoverflow.com/a/18479126
- mechanism and policy
- what capabilities are to be provided ... the mechanism
- how those capabilities can be used ... the policy
- the window and session managers
- policy free
- kernel
- process management
- memory management
- filesystems
- device control
- networking
- devices and modules
- character devices
- block devices
- network devices
- version numbering
- even-numbered kernel versions are stable
#include <linux/init.h>
#include <linux/module.h>
MODULE_LICENSE("GPL"); // declare the license so that loading the module does not taint the kernel
static int hello_init(void)
{
printk(KERN_ALERT "Hello, world\n");
return 0;
}
static void hello_exit(void)
{
printk(KERN_ALERT "Goodbye, cruel world\n");
}
module_init(hello_init);
module_exit(hello_exit);
root# tail -f /var/log/messages
root# insmod ./hello.ko
root# rmmod hello
- event-driven
- no libc included
- not sequential, concurrent, asynchronous
- current process:
#include <linux/sched.h>
printk(KERN_INFO "The process is \"%s\"(pid %d)\n", current->comm, current->pid);
- the kernel has a very small stack (it can be as small as a single 4096-byte page), so large variables should be allocated dynamically instead of on the stack
__
prefixed functions are low-level component, we should use them carefully
- cannot do floating point arithmetic
- Readers is a make-like build tool for kernel
# When invoked from the kernel build system KERNELRELEASE is set, so only
# the list of objects to build is declared.
ifneq ($(KERNELRELEASE),)
obj-m := hello.o
# Otherwise the file was invoked directly: re-run make inside the kernel
# build tree, pointing it back at this directory through M=.
else
KERNELDIR ?= /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)
default:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules
endif
insmod
calls sys_init_module
function within kernel/module.c
- system calls are prefixed with
sys_
modprobe
loads other modules that the module needs
lsmod
lists the modules currently loaded
- link the module against
vermagic.o
linux/version.h
- UTS_RELEASE ... string
- LINUX_VERSION_CODE ... binary
- KERNEL_VERSION(major,minor,release)
- optimization each processor family
- module stack
- avoid symbol pollution
EXPORT_SYMBOL(name);
EXPORT_SYMBOL_GPL(name);
MODULE_*
MODULE_LICENSE
MODULE_AUTHOR
MODULE_DESCRIPTION
MODULE_VERSION
MODULE_ALIAS
MODULE_DEVICE_TABLE
- grep
EXPORT_SYMBOL
or register_
for finding entrypoint of other device drivers
- special modifiers
static int __init initialization_function(void)
static void __exit cleanup_function(void)
- error handling
- use the goto statement as the best error-recovery tool, to lessen CPU time
<linux/errno.h>
- negative numbers
- module-loading races
- module parameters
/etc/modprobe.conf
- moduleparam.h
- bool, invbool, charp, int, long, short, uint, ulong, ushort
$ insmod my_module param_one=foo param_two=7 param_three=a,b,c,d,e
static char *param_one = "baz";
static int param_two = 1;
static char **param_three = ["a", "b", "c"];
// the 3rd arg is a permission for sysfs, see <linux/stat.h>
// if the permission makes writable, the module should be able to detect it by self
module_param(param_one, charp, S_IRUGO);
module_param(param_two, int, S_IRUGO);
module_param_array(param_three, charp, 12, S_IRUGO);
- user-space device driver
- we can use the libc
- as a server process
- overhead of the context switch
- require some privileges
- e.g.
- gadgetfs
- the X server
- the following user-space drivers rely on the SCSI generic kernel-space driver
- SCSI scanner drivers (in SANE pkg)
- CD writers (in cdrecord pkg)
- See
/sys/module
, /proc/modules
- character devices are identified by
c
like this:
$ ls -l /dev/ | head
total 0
crw-r--r-- 1 root root 10, 235 Feb 22 16:17 autofs
drwxr-xr-x 2 root root 580 Feb 22 16:17 block
drwxr-xr-x 2 root root 100 Feb 22 16:17 bsg
crw-rw---- 1 root disk 10, 234 Feb 22 16:17 btrfs-control
drwxr-xr-x 3 root root 60 Feb 22 16:17 bus
drwxr-xr-x 2 root root 2720 Feb 22 16:17 char
crw--w---- 1 root tty 5, 1 Feb 22 16:18 console
lrwxrwxrwx 1 root root 11 Feb 22 16:17 core -> /proc/kcore
crw------- 1 root root 10, 125 Feb 22 16:17 cpu_dma_latency
- the comma-separated numbers are the major and minor device numbers
- major numbers ... 1,4,7, and 10
- minor numbers ... 1,3,5,64,65 and 129
- one-major-one-driver principle
#include <linux/types.h>
MAJOR(dev_t dev);
MINOR(dev_t dev);
MKDEV(int major, int minor);
#include <linux/fs.h>
# it returns zero or a negative error code
int register_chrdev_region(dev_t first, unsigned int count, char *name);
# dynamically
int alloc_chrdev_region(dev_t *dev, unsigned int firstminor, unsigned int count, char *name);
void unregister_chrdev_region(dev_t first, unsigned int count);
- Use dynamic allocation to obtain your major device number
$ less /proc/devices
$ mknod NAME TYPE MAJOR MINOR
<linux/fs.h>
- the
file_operations
structure
- the
inode_operations
structure
- the
file
structure
struct file_operations scull_fops = {
.owner = THIS_MODULE,
.llseek = scull_llseek,
.read = scull_read,
.write = scull_write,
.ioctl = scull_ioctl,
.open = scull_open,
.release = scull_release,
};
void cdev_init(struct cdev *, struct file_operations *);
// num is the first device number, count is the number of device numbers associated with the device
// cdev_add should be called only when everything is ready
int cdev_add(struct cdev *, dev_t num, unsigned int count);
void cdev_del(struct cdev *);
- registration, open, release, write, read
- pread, pwrite ... they don't change the file position
- readv, writev ... vector version
#include <linux/slab.h>
void *kmalloc(size_t size, int flags);
void kfree(void *ptr);
- DO NOT DEREFERENCE DIRECTLY user space buffer
// signed size type
ssize_t read(struct file *filp, char __user *buff, size_t count, loff_t *offp);
ssize_t write(struct file *filp, const char __user *buff, size_t count, loff_t *offp);
// page cache consideration, swappable
// user-space pointer validation
#include <linux/uaccess.h>
unsigned long copy_to_user(void __user *to, const void *from, unsigned long count);
unsigned long copy_from_user(void *to, const void __user *from, unsigned long count);
- we could not use generic debugger, but can use several configurations for debugging to kernel development
CONFIG_DEBUG_KERNEL
CONFIG_DEBUG_INFO
CONFIG_DEBUG_DRIVER
CONFIG_INPUT_EVBUG
- etc...
- loglevel
$ cat /proc/sys/kernel/printk
4 4 1 7
## current, default, minimum, boot-time default
int printk_ratelimit(void);
if (printk_ratelimit())
printk(KERN_NOTICE "The printer is still on fire\n");
$ cat /proc/sys/kernel/printk_ratelimit
5 # the number of seconds to wait before re-enabling messages
$ cat /proc/sys/kernel/printk_ratelimit_burst
10 # the number of messages accepted before rate-limiting
int print_dev_t(char *buffer, dev_t dev);
char *format_dev_t(char *buffer, dev_t dev);
/proc/*
- They are used by the kernel to export information to the world
- They are generated on the fly when the file is read
- They are tied to a kernel function
- Several commands get their information from these files, such as
ps
, top
and uptime
, and so on
- read only basically
- via sysfs, recommended way
- obsolete and older way
- better and newer way
- the best way, you should implement ioctl methods
- faster than reading
/proc
int (*read_proc)(char *page, char **start, off_t offset, int count, int *eof, void *data);
struct proc_dir_entry *create_proc_read_entry(const char *name, mode_t mode, struct proc_dir_entry *base, read_proc_t *read_proc, void *data);
remove_proc_entry(const char *name, struct proc_dir_entry *base);
arg |
desc |
page |
the buffer where you'll write your data |
start |
is used by the function to say where the interesting data has been written in page, is useful if the buffer is greater than one page, may be NULL |
offset |
same as the read function |
count |
same as the read function |
eof |
must be set by the driver to signal that it has no more data to return |
data |
you can use for internal bookkeeping |
arg |
desc |
name |
the name of the file in /proc |
mode |
the protection mask, zero is system-wide default |
base |
the parent dir, NULL is /proc root |
read_proc |
the function |
data |
the client data, ignored by kernel |
void *start(struct seq_file *sfile, loff_t *pos);
void *next(struct seq_file *sfile, void *v, loff_t *pos);
void stop(struct seq_file *sfile, void *v);
int show(struct seq_file *sfile, void *v);
// you should not use printk in show function, instead:
int seq_printf(struct seq_file *sfile, const char *fmt, ...);
int seq_putc(struct seq_file *sfile, char c);
int seq_puts(struct seq_file *sfile, const char *s);
int seq_escape(struct seq_file *sfile, const char *s, const char *esc);
int seq_path(struct seq_file *sfile, struct vfsmount *m, struct dentry *dentry, char *esc);
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent);
entry->proc_fops = &your_fops;
- use strace command
- with useful options, such as
-t
, -T
, -e
, -o
, ...
- oops messages
- https://github.com/torvalds/linux/blob/master/arch/um/kernel/trap.c
- EIP ... the instruction pointer
- on the x86 architecture, by default, the user-space stack starts just below
0xc0000000
- the kernel-space stack starts at
0xc0000000
- slab poisoning, the offending address is
0xa5a5a5a5
, which suggests that you forgot to initialize dynamically allocated memory
- system hangs
- magic System Request (SysRq) keys
$ gdb /usr/src/linux/vmlinux /proc/kcore
- uncompressed ELF kernel executable, not the zImage or bzImage or else
- .text, .bss, .data
/sys/module/*/sections
(gdb) print *(address)
- kdb, kgdb
- user-mode linux (UML)
- has no access to the host's hardware
- linux trace toolkit (LTT)
- dynamic probes (DProbes)
- SMP, symmetric multiprocessing
- workqueues, tasklets, timers
- against critical sections, use semaphore
- holding, taken out, acquired a lock
#include <asm/semaphore.h>
void sema_init(struct semaphore *sem, int val);
DECLARE_MUTEX(name); // 1
DECLARE_MUTEX_LOCKED(name); // 0
// at runtime
void init_MUTEX(struct semaphore *sem);
void init_MUTEX_LOCKED(struct semaphore *sem);
void down(struct semaphore *sem);
int down_interruptible(struct semaphore *sem);
int down_trylock(struct semaphore *sem);
void up(struct semaphore *sem);
- You should undo any user-visible changes if you return
-ERESTARTSYS
- if you cannot undo things, you should return
-EINTR
instead
- rwsem (reader/writer semaphore)
#include <linux/rwsem.h>
void init_rwsem(struct rw_semaphore *sem);
void down_read(struct rw_semaphore *sem);
int down_read_trylock(struct rw_semaphore *sem);
void up_read(struct rw_semaphore *sem);
void down_write(struct rw_semaphore *sem);
int down_write_trylock(struct rw_semaphore *sem);
void up_write(struct rw_semaphore *sem);
void downgrade_write(struct rw_semaphore *sem);
#include <linux/completion.h>
DECLARE_COMPLETION(struct completion c);
init_completion(struct completion *c);
void wait_for_completion(struct completion *c);
void complete(struct completion *c);
void complete_all(struct completion *c);
INIT_COMPLETION(struct completion c); // reuse
void complete_and_exit(struct completion *c, long retval);
- spinlocks
- don't let a process into sleep
- tight loop
- avoid deadlocks in hyperthreaded processors
- spinning forever in nonpreemptive uniprocessor
- be careful when writing code that will execute under a spinlock: it must not sleep
- hold the lock for the minimum time possible
- irq ... interrupt request
#include <linux/spinlock.h>
spinlock_t my_lock = SPIN_LOCK_UNLOCKED; // at compile time
void spin_lock_init(spinlock_t *lock); // at runtime
void spin_lock(spinlock_t *lock);
void spin_unlock(spinlock_t *lock);
void spin_lock(spinlock_t *lock);
void spin_lock_irqsave(spinlock_t *lock, unsigned long flags); // disables interrupts on the local processor only
void spin_lock_irq(spinlock_t *lock);
void spin_lock_bh(spinlock_t *lock); // disables software interrupts, but leaves hardware interrupts
void spin_unlock(spinlock_t *lock);
void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags);
void spin_unlock_irq(spinlock_t *lock);
void spin_unlock_bh(spinlock_t *lock);
int spin_trylock(spinlock_t *lock);
int spin_trylock_bh(spinlock_t *lock);
rwlock_t my_rwlock = RW_LOCK_UNLOCKED; // static
void rwlock_init(rwlock_t *); // dynamic
...
- coarse-grained locking -> fine-grained locking
- the big kernel lock
- more complex
- lock-free algorithms
- atomic variables
- bit operations
- seqlocks
- read-copy-update (RCU)
// ioctl(2) in user-space
int ioctl(int fd, unsigned long cmd, ...);
// in kernel-space
int (*ioctl)(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
int access_ok(int type, const void *addr, unsigned long size);
// e.g.
if (!access_ok(VERIFY_WRITE, addr, sizeof(int))) {
return -EFAULT;
}
// faster than copy_{from|to}_user functions
// but, if you receive a message like "conversion to non-scalar type requested" from the compiler,
// you should use the copy_{from|to}_user functions instead
put_user(datum, ptr); // calling access_ok internally
__put_user(datum, ptr);
get_user(local, ptr);
__get_user(local, ptr);
#include <linux/sched.h>
int capable(int capability);
// e.g.
if (!capable(CAP_SYS_ADMIN)) {
return -EPERM;
}
- safe manners
- you never sleep when you are running in an atomic context
- you never know how long your process may have been out of the CPU or what may have changed in the mean time
- you must check to ensure that the condition you were waiting for is, indeed, true
- use wait queue
#include <linux/wait.h>
DECLARE_WAIT_QUEUE_HEAD(name); // static
init_waitqueue_head(wait_queue_head_t *); // dynamic
- nonblocking
- input / output buffer
- performance gain, prevent overheads of the context switches
/*
 * Per-device state of the scull pipe: a circular buffer with its read and
 * write positions, the wait queues used by readers and writers, open
 * counters, the asynchronous-notification list, the semaphore that
 * serializes access, and the embedded char device structure.
 */
struct scull_pipe {
wait_queue_head_t inq, outq; /* read and write queues */
char *buffer, *end; /* begin of buf, end of buf */
int buffersize; /* used in pointer arithmetic */
char *rp, *wp; /* where to read, where to write */
int nreaders, nwriters; /* number of openings for r/w */
struct fasync_struct *async_queue; /* asynchronous readers */
struct semaphore sem; /* mutual exclusion semaphore */
struct cdev cdev; /* Char device structure */
};
static ssize_t scull_p_read (struct file *filp, char _ _user *buf, size_t count, loff_t *f_pos) {
struct scull_pipe *dev = filp->private_data;
if (down_interruptible(&dev->sem)) {
return -ERESTARTSYS;
}
while (dev->rp == dev->wp) { /* nothing to read */
up(&dev->sem); /* release the lock */
if (filp->f_flags & O_NONBLOCK) {
return -EAGAIN;
}
PDEBUG("\"%s\" reading: going to sleep\n", current->comm);
if (wait_event_interruptible(dev->inq, (dev->rp != dev->wp))) {
return -ERESTARTSYS; /* signal: tell the fs layer to handle it */
}
/* otherwise loop, but first reacquire the lock */
if (down_interruptible(&dev->sem)) {
return -ERESTARTSYS;
}
}
/* ok, data is there, return something */
if (dev->wp > dev->rp) {
count = min(count, (size_t)(dev->wp - dev->rp));
} else /* the write pointer has wrapped, return data up to dev->end */ {
count = min(count, (size_t)(dev->end - dev->rp));
}
if (copy_to_user(buf, dev->rp, count)) {
up (&dev->sem);
return -EFAULT;
}
dev->rp += count;
if (dev->rp == dev->end) {
dev->rp = dev->buffer; /* wrapped */
}
up(&dev->sem);
/* finally, awake any writers and return */
wake_up_interruptible(&dev->outq);
PDEBUG("\"%s\" did read %li bytes\n",current->comm, (long)count);
return count;
}
void set_current_state(int new_state);
current->state = TASK_INTERRUPTIBLE; // discouraged manner
if (!condition) {
schedule();
}
DEFINE_WAIT(my_wait);
init_wait(wait_queue_t *wait);
void prepare_to_wait(wait_queue_head_t *queue, wait_queue_t *wait, int state);
void finish_wait(wait_queue_head_t *queue, wait_queue_t *wait);
/*
 * Wait for space for writing; caller must hold the device semaphore.
 *
 * Returns 0 once spacefree(dev) is non-zero, with the semaphore held.
 * On error the semaphore has been released before returning:
 *   -EAGAIN       buffer full and the file is non-blocking
 *   -ERESTARTSYS  a signal is pending, or reacquiring the semaphore was
 *                 interrupted
 */
static int scull_getwritespace(struct scull_pipe *dev, struct file *filp) {
while (spacefree(dev) == 0) { /* full */
DEFINE_WAIT(wait);
/* drop the semaphore before (possibly) going to sleep */
up(&dev->sem);
if (filp->f_flags & O_NONBLOCK) {
return -EAGAIN;
}
PDEBUG("\"%s\" writing: going to sleep\n", current->comm);
/* queue ourselves on outq and mark the task interruptible ... */
prepare_to_wait(&dev->outq, &wait, TASK_INTERRUPTIBLE);
/* ... then test the condition again and only sleep if still full */
if (spacefree(dev) == 0) {
schedule();
}
finish_wait(&dev->outq, &wait);
if (signal_pending(current)) {
return -ERESTARTSYS; /* signal: tell the fs layer to handle it */
}
/* reacquire the semaphore before re-evaluating the loop condition */
if (down_interruptible(&dev->sem)) {
return -ERESTARTSYS;
}
}
return 0;
}
void prepare_to_wait_exclusive(wait_queue_head_t *queue, wait_queue_t *wait, int state);
- poll and select and epoll
unsigned int (*poll)(struct file *filep, poll_table *wait);
void poll_wait(struct file *, wait_queue_head_t *, poll_table *);
/*
 * poll() method of the scull pipe.
 *
 * Registers both wait queues of the device with the poll table and
 * returns a bit mask telling whether the device can currently be read
 * and/or written. The whole check runs with the device semaphore held.
 */
static unsigned int scull_p_poll(struct file *filp, poll_table *wait)
{
    struct scull_pipe *pipe = filp->private_data;
    unsigned int events = 0;

    down(&pipe->sem);
    poll_wait(filp, &pipe->inq, wait);
    poll_wait(filp, &pipe->outq, wait);

    /*
     * Circular buffer: it is empty when the read and write pointers are
     * equal, and full when "wp" is right behind "rp".
     */
    events |= (pipe->rp != pipe->wp) ? (POLLIN | POLLRDNORM) : 0;  /* readable */
    events |= spacefree(pipe) ? (POLLOUT | POLLWRNORM) : 0;        /* writable */

    up(&pipe->sem);
    return events;
}
// datasync is used to distinguish between fsync and fdatasync system calls, for filesystem code
int (*fsync)(struct file *file, struct dentry *dentry, int datasync);
- asynchronous notification
signal(SIGIO, &input_handler); /* dummy sample; sigaction() is better */
fcntl(STDIN_FILENO, F_SETOWN, getpid());
oflags = fcntl(STDIN_FILENO, F_GETFL);
fcntl(STDIN_FILENO, F_SETFL, oflags | FASYNC);
#include <linux/fs.h>
int fasync_helper(int fd, struct file *filp, int mode, struct fasync_struct **fa);
void kill_fasync(struct fasync_struct **fa, int sig, int band);
/*
 * fasync() method of the scull pipe: forwards the request to
 * fasync_helper(), operating on the device's own asynchronous-reader list.
 */
static int scull_p_fasync(int fd, struct file *filp, int mode)
{
    struct scull_pipe *pipe = filp->private_data;
    struct fasync_struct **readers = &pipe->async_queue;

    return fasync_helper(fd, filp, mode, readers);
}
/* If any asynchronous readers are registered, send them SIGIO (POLL_IN). */
if (dev->async_queue) {
    kill_fasync(&dev->async_queue, SIGIO, POLL_IN);
}
/* remove this filp from the asynchronously notified filp's */
scull_p_fasync(-1, filp, 0);
/*
 * llseek() method of scull.
 *
 * Computes the new file position from `off` and `whence`
 * (0 = SEEK_SET, 1 = SEEK_CUR, 2 = SEEK_END), stores it in filp->f_pos
 * and returns it. Returns -EINVAL for an unknown `whence` or when the
 * resulting position would be negative.
 */
loff_t scull_llseek(struct file *filp, loff_t off, int whence)
{
    struct scull_dev *dev = filp->private_data;
    loff_t target;

    if (whence == 0) {          /* SEEK_SET */
        target = off;
    } else if (whence == 1) {   /* SEEK_CUR */
        target = filp->f_pos + off;
    } else if (whence == 2) {   /* SEEK_END */
        target = dev->size + off;
    } else {                    /* can't happen */
        return -EINVAL;
    }

    if (target < 0) {
        return -EINVAL;
    }

    filp->f_pos = target;
    return target;
}
// inform the kernel that your device does not support llseek
int nonseekable_open(struct inode *inode, struct file *filp);
// Also, you should point the no_llseek func to your file_operations structures
- access control
- current->uid, current->euid, capable(CAP_DAC_OVERRIDE)
- linux tape driver helps your understanding of access control
- also, /dev/tty is too
- software devices, virtual devices
- measuring time lapses
- jiffies
- user space
/proc/interrupts
/ /proc/uptime
#include <linux/jiffies.h>
unsigned long j, stamp_1, stamp_half, stamp_n;
j = jiffies; /* read the current value */
stamp_1 = j + HZ; /* 1 second in the future */
stamp_half = j + HZ / 2; /* half a second */
stamp_n = j + n * HZ / 1000; /* n milliseconds */
int time_after(unsigned long a, unsigned long b); /* true when a is after b */
int time_before(unsigned long a, unsigned long b);
int time_after_eq(unsigned long a, unsigned long b); /* after or equal */
int time_before_eq(unsigned long a, unsigned long b); /* before or equal */
#include <linux/time.h>
// newer
unsigned long timespec_to_jiffies(struct timespec *value);
void jiffies_to_timespec(unsigned long jiffies, struct timespec *value);
// older and popular
unsigned long timeval_to_jiffies(struct timeval *value);
void jiffies_to_timeval(unsigned long jiffies, struct timeval *value);
- processor-specific registers
unsigned long mktime(unsigned int year, unsigned int mon, unsigned int day, unsigned int hour, unsigned int min, unsigned int sec);
void do_gettimeofday(struct timeval *tv);
struct timespec current_kernel_time(void);
#include <linux/sched.h>
set_current_state(TASK_INTERRUPTIBLE);
signed long schedule_timeout(signed long timeout);
#include <linux/delay.h>
void ndelay(unsigned long nsecs);
void udelay(unsigned long usecs);
void mdelay(unsigned long msecs);
void msleep(unsigned int millisecs);
unsigned long msleep_interruptible(unsigned int millisecs);
void ssleep(unsigned int seconds);
#include <linux/timer.h>
/* Excerpt of the kernel timer descriptor; fields not shown here are elided. */
struct timer_list {
/* ... */
unsigned long expires; /* NOTE(review): presumably the jiffies value at which the timer fires - confirm */
void (*function)(unsigned long); /* callback taking a single unsigned long argument */
unsigned long data; /* NOTE(review): presumably the argument handed to function() - confirm */
};
void init_timer(struct timer_list *timer); // dynamic
struct timer_list TIMER_INITIALIZER(_function, _expires, _data); // static
void add_timer(struct timer_list * timer);
int del_timer(struct timer_list * timer);
#include <linux/interrupt.h>
/* Excerpt of the tasklet descriptor; fields not shown here are elided. */
struct tasklet_struct {
/* ... */
void (*func)(unsigned long); /* tasklet function taking a single unsigned long argument */
unsigned long data; /* NOTE(review): presumably the argument handed to func() - confirm */
};
void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data);
DECLARE_TASKLET(name, func, data);
DECLARE_TASKLET_DISABLED(name, func, data);
struct workqueue_struct *create_workqueue(const char *name);
struct workqueue_struct *create_singlethread_workqueue(const char *name);
// static
DECLARE_WORK(name, void (*function)(void *), void *data);
// dynamic
INIT_WORK(struct work_struct *work, void (*function)(void *), void *data);
PREPARE_WORK(struct work_struct *work, void (*function)(void *), void *data);
int queue_work(struct workqueue_struct *queue, struct work_struct *work);
int queue_delayed_work(struct workqueue_struct *queue, struct work_struct *work, unsigned long delay);
int cancel_delayed_work(struct work_struct *work);
void flush_workqueue(struct workqueue_struct *queue);
void destroy_workqueue(struct workqueue_struct *queue);
int schedule_work(struct work_struct *work);
int schedule_delayed_work(struct work_struct *work, unsigned long delay);
int cancel_delayed_work(struct work_struct *work);
void flush_scheduled_work(void);
static struct work_struct my_work;
INIT_WORK(&my_work, my_func, &my_data);
prepare_to_wait(&my_wait, &wait, TASK_INTERRUPTIBLE);
schedule_work(&my_work);
schedule();
finish_wait(&my_wait, &wait);
#include <linux/slab.h>
void *kmalloc(size_t size, int flags);
kmem_cache_t *kmem_cache_create(
const char *name,
size_t size,
size_t offset,
unsigned long flags,
void (*constructor)(void *, kmem_cache_t *, unsigned long flags),
void (*destructor)(void *, kmem_cache_t *, unsigned long flags)
);
void *kmem_cache_alloc(kmem_cache_t *cache, int flags);
void kmem_cache_free(kmem_cache_t *cache, const void *obj);
int kmem_cache_destroy(kmem_cache_t *cache);
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data);
typedef void *(mempool_alloc_t)(int gfp_mask, void *pool_data);
typedef void (mempool_free_t)(void *element, void *pool_data);
cache = kmem_cache_create(. . .);
pool = mempool_create(MY_POOL_MINIMUM, mempool_alloc_slab, mempool_free_slab, cache);
void *mempool_alloc(mempool_t *pool, int gfp_mask);
void mempool_free(void *element, mempool_t *pool);
int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
void mempool_destroy(mempool_t *pool);
- to allocate big chunks of memory
get_zeroed_page(unsigned int flags);
__get_free_page(unsigned int flags);
__get_free_pages(unsigned int flags, unsigned int order); // log2N, 1 page is 0, 8 pages are 3
void free_page(unsigned long addr);
void free_pages(unsigned long addr, unsigned long order);
- the vmalloc family: avoid it where possible, there is an overhead because of the page tables
#include <linux/vmalloc.h>
void *vmalloc(unsigned long size);
void vfree(void * addr);
void *ioremap(unsigned long offset, unsigned long size);
void iounmap(void * addr);
$ less /proc/kallsyms
// at a compile time
DEFINE_PER_CPU(int[3], my_percpu_array);
DEFINE_PER_CPU(type, name);
// at a runtime
void *alloc_percpu(type);
void *__alloc_percpu(size_t size, size_t align);
get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use);
per_cpu(variable, int cpu_id);
per_cpu_ptr(void *per_cpu_var, int cpu_id);
EXPORT_PER_CPU_SYMBOL(per_cpu_var);
EXPORT_PER_CPU_SYMBOL_GPL(per_cpu_var);
DECLARE_PER_CPU(type, name);
- obtaining a large and contiguous buffer at boot time
#include <linux/bootmem.h>
void *alloc_bootmem(unsigned long size);
void *alloc_bootmem_low(unsigned long size);
void *alloc_bootmem_pages(unsigned long size);
void *alloc_bootmem_low_pages(unsigned long size);
void free_bootmem(unsigned long addr, unsigned long size);
- hardware registers and memory (RAM)
- most PCI devices map registers into a memory address region
- reordering of CPU instructions by compiler optimization
- prevent to reorder and cache
- a memory barrier
- on x86, writes outside the processor are not reordered, but reads are reordered
- wmb() does nothing
- mb() is slower than wmb()
#include <linux/kernel.h>
void barrier(void);
#include <asm/system.h>
void rmb(void);
void read_barrier_depends(void);
void wmb(void);
void mb(void);
void smp_rmb(void);
void smp_read_barrier_depends(void);
void smp_wmb(void);
void smp_mb(void);
writel(dev->registers.addr, io_destination_address);
writel(dev->registers.size, io_size);
writel(dev->registers.operation, DEV_READ);
wmb();
writel(dev->registers.control, DEV_GO);
/*
 * Assign `value` to `var` and then issue the matching memory barrier
 * (mb(), wmb() or rmb()). Each macro is wrapped in do { ... } while (0)
 * so that it behaves as a single statement, e.g. inside if/else.
 */
#define set_mb(var, value) do { (var) = (value); mb(); } while (0)
#define set_wmb(var, value) do { (var) = (value); wmb(); } while (0)
#define set_rmb(var, value) do { (var) = (value); rmb(); } while (0)
#include <linux/ioport.h>
struct resource *request_region(unsigned long first, unsigned long n, const char *name);
void release_region(unsigned long start, unsigned long n);
// deprecated, not in atomic manner
int check_region(unsigned long first, unsigned long n);
$ cat /proc/ioports
0000-0000 : dma1
0000-0000 : pic1
0000-0000 : timer0
0000-0000 : timer1
0000-0000 : keyboard
0000-0000 : keyboard
0000-0000 : rtc0
0000-0000 : dma page reg
0000-0000 : pic2
0000-0000 : dma2
0000-0000 : fpu
0000-0000 : ACPI PM1a_EVT_BLK
0000-0000 : ACPI PM1a_CNT_BLK
0000-0000 : ACPI PM_TMR
0000-0000 : ACPI GPE0_BLK
// for 8-bit
unsigned inb(unsigned port);
void outb(unsigned char byte, unsigned port);
// for 16-bit
unsigned inw(unsigned port);
void outw(unsigned short word, unsigned port);
// for 32-bit
unsigned inl(unsigned port);
void outl(unsigned longword, unsigned port);
// there is no 64-bit functions
- string operations
- keep in mind that byte-ordering
- also, needs pausing I/O with
inb_p
, outb_p
and so on, prevent to overclock processors
// 8-bit
void insb(unsigned port, void *addr, unsigned long count);
void outsb(unsigned port, void *addr, unsigned long count);
// 16-bit
void insw(unsigned port, void *addr, unsigned long count);
void outsw(unsigned port, void *addr, unsigned long count);
// 32-bit
void insl(unsigned port, void *addr, unsigned long count);
void outsl(unsigned port, void *addr, unsigned long count);
- the parallel port
- ECP, EPP modes
- 8-bit
0x378
, 0x278
- first port
- bidirectional data register
- it connects directly to pins 2-9 on the physical connector
- second port
- read-only status register
- e.g. printer, online, out of paper, busy
- third port
- output-only control register
- interrupts
- TTL, transistor-transistor logic
while (count--) {
outb(*(ptr++), port);
wmb();
}
#include <linux/ioport.h>
struct resource *request_mem_region(unsigned long start, unsigned long len, char *name);
void release_mem_region(unsigned long start, unsigned long len);
int check_mem_region(unsigned long start, unsigned long len); // deprecated, unsafe, old
#include <asm/io.h>
void *ioremap(unsigned long phys_addr, unsigned long size);
void *ioremap_nocache(unsigned long phys_addr, unsigned long size);
void iounmap(void *addr);
unsigned int ioread8(void *addr);
unsigned int ioread16(void *addr);
unsigned int ioread32(void *addr);
void iowrite8(u8 value, void *addr);
void iowrite16(u16 value, void *addr);
void iowrite32(u32 value, void *addr);
void ioread8_rep(void *addr, void *buf, unsigned long count);
void ioread16_rep(void *addr, void *buf, unsigned long count);
void ioread32_rep(void *addr, void *buf, unsigned long count);
void iowrite8_rep(void *addr, const void *buf, unsigned long count);
void iowrite16_rep(void *addr, const void *buf, unsigned long count);
void iowrite32_rep(void *addr, const void *buf, unsigned long count);
void memset_io(void *addr, u8 value, unsigned int count);
void memcpy_fromio(void *dest, void *source, unsigned int count);
void memcpy_toio(void *dest, void *source, unsigned int count);
$ cat /proc/iomem | head -n 2
00000000-00000000 : Reserved
00000000-00000000 : System RAM
// request region in advance
void *ioport_map(unsigned long port, unsigned int count);
void ioport_unmap(void *addr);
while (count--) {
iowrite8(*ptr++, address);
wmb();
}
#define ISA_BASE 0xA0000
#define ISA_MAX 0x100000 /* for general memory access */
/* this line appears in silly_init */
io_base = ioremap(ISA_BASE, ISA_MAX - ISA_BASE);
int request_irq(unsigned int irq,
irqreturn_t (*handler)(int, void *, struct pt_regs *),
unsigned long flags,
const char *dev_name,
void *dev_id);
void free_irq(unsigned int irq, void *dev_id);
int can_request_irq(unsigned int irq, unsigned long flags);
- to contribute to the system entropy pool
$ cat /proc/interrupts
CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7
8: 0 0 0 0 0 0 0 0 IO-APIC 8-edge rtc0
9: 6 0 0 0 0 0 0 0 IO-APIC 9-fasteoi acpi
24: 0 1 0 0 0 0 0 0 Hyper-V PCIe MSI 2818572288-edge virtio0-config
25: 0 0 1327 0 0 0 0 0 Hyper-V PCIe MSI 2818572289-edge virtio0-virtqueues
26: 0 0 0 0 0 0 0 0 Hyper-V PCIe MSI 1879048192-edge virtio1-config
27: 0 0 0 0 0 181 0 0 Hyper-V PCIe MSI 1879048193-edge virtio1-requests
28: 0 0 0 0 0 0 1 0 Hyper-V PCIe MSI 3221225472-edge virtio2-config
29: 0 0 0 0 0 0 0 1 Hyper-V PCIe MSI 3221225473-edge virtio2-hiprio
30: 10 0 0 0 0 0 0 0 Hyper-V PCIe MSI 3221225474-edge virtio2-requests.0
31: 0 0 0 0 0 0 0 0 Hyper-V PCIe MSI 2684354560-edge virtio3-config
32: 0 0 3 0 0 0 0 0 Hyper-V PCIe MSI 2684354561-edge virtio3-requests
33: 0 0 0 0 0 0 0 0 Hyper-V PCIe MSI 2818572288-edge virtio4-config
34: 0 0 0 0 0 0 0 0 Hyper-V PCIe MSI 2818572289-edge virtio4-requests
NMI: 0 0 0 0 0 0 0 0 Non-maskable interrupts
LOC: 0 0 0 0 0 0 0 0 Local timer interrupts
SPU: 0 0 0 0 0 0 0 0 Spurious interrupts
PMI: 0 0 0 0 0 0 0 0 Performance monitoring interrupts
IWI: 1 0 0 0 0 0 0 0 IRQ work interrupts
RTR: 0 0 0 0 0 0 0 0 APIC ICR read retries
RES: 3730 2322 4591 2898 4379 2173 4061 2478 Rescheduling interrupts
CAL: 134428 144302 142620 104106 137352 93561 135487 132745 Function call interrupts
TLB: 0 0 0 0 0 0 0 0 TLB shootdowns
TRM: 0 0 0 0 0 0 0 0 Thermal event interrupts
HYP: 154677 77066 1558 5878 3027 663 11278 759 Hypervisor callback interrupts
HRE: 0 0 0 0 0 0 0 0 Hyper-V reenlightenment interrupts
HVS: 512616 231656 458997 262581 432344 199343 486263 194674 Hyper-V stimer0 interrupts
ERR: 0
MIS: 0
PIN: 0 0 0 0 0 0 0 0 Posted-interrupt notification event
NPI: 0 0 0 0 0 0 0 0 Nested posted-interrupt event
PIW: 0 0 0 0 0 0 0 0 Posted-interrupt wakeup event
$ cat /proc/stat
cpu 248635 645 41321 68260050 7161 0 4129 0 0 0
cpu0 31514 84 6070 8529837 756 0 2522 0 0 0
cpu1 38679 64 4976 8524547 2117 0 1095 0 0 0
cpu2 30271 35 6199 8531722 904 0 236 0 0 0
cpu3 29410 100 4639 8535816 404 0 127 0 0 0
cpu4 33538 106 6229 8528750 584 0 43 0 0 0
cpu5 23946 170 3183 8542885 445 0 51 0 0 0
cpu6 32950 75 6566 8528753 808 0 43 0 0 0
cpu7 28327 11 3459 8537737 1140 0 12 0 0 0
intr 1088268 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1327 0 181 1 1 10 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
ctxt 5650638
btime 1681698117
processes 20053
procs_running 1
procs_blocked 0
softirq 5528223 0 1069762 5 38472 0 0 232517 1591526 103 2595838
- autodetecting the IRQ number
- the kernel provides helper functions
- probing by doing it yourself
#include <linux/interrupt.h>
unsigned long probe_irq_on(void);
int probe_irq_off(unsigned long);
/*
 * Interrupt handler: formats a fixed-size 16-byte timestamp record
 * ("ssssssss.uuuuuu\n") at short_head, advances short_head past it and
 * wakes any process sleeping on short_queue.
 *
 * irq, dev_id and regs are not used by the body.
 * Always returns IRQ_HANDLED.
 */
irqreturn_t short_interrupt(int irq, void *dev_id, struct pt_regs *regs) {
    struct timeval tv;
    int written;

    do_gettimeofday(&tv);
    /* Write a 16 byte record. Assume PAGE_SIZE is a multiple of 16 */
    written = sprintf(
        (char *)short_head, "%08u.%06u\n",
        /* %u consumes an unsigned int, so the casts must match it */
        (unsigned int)(tv.tv_sec % 100000000),
        (unsigned int)(tv.tv_usec)
    );
    BUG_ON(written != 16); /* 8 digits + '.' + 6 digits + '\n' */
    short_incr_bp(&short_head, written);
    wake_up_interruptible(&short_queue); /* awake any reading process */
    return IRQ_HANDLED;
}
/*
 * Advance the buffer position stored in *index by delta, wrapping back to
 * short_buffer once the new position reaches short_buffer + PAGE_SIZE.
 */
static inline void short_incr_bp(volatile unsigned long *index, int delta) {
unsigned long new = *index + delta;
barrier(); /* Don't optimize these two together */
/* the (possibly wrapped) position is published with a single store to *index */
*index = (new >= (short_buffer + PAGE_SIZE)) ? short_buffer : new;
}
- PIC ... programmable interrupt controller
- splitting the interrupt handler into two halves
- top-half handler
- responds to the interrupt
- bottom-half handler
- is scheduled by top-half handler and is executed later at a safer time
- tasklets, workqueues
- shared interrupts
- a shared handler must be able to recognize its own interrupts and should quickly exit by returning
IRQ_NONE
when its own device has not interrupted
- three types
- standard C types, such as
int
- don't use it when you need "a 2-byte filter" or "4-byte string"
- explicitly sized types, such as
u32
- types used for specific kernel objects, such as
pid_t
- it has gone out of favor among many kernel developers
- portability issues
#include <asm/page.h>
/* get_order() converts a byte count into the page order the page allocator expects */
int order = get_order(16*1024);
/* the page-allocator entry point is __get_free_pages(); there is no get_free_pages() */
buf = __get_free_pages(GFP_KERNEL, order);
u32 cpu_to_le32 (u32);
u32 le32_to_cpu (u32);
- data alignment
- there is a great performance penalty if you need to access unaligned data
- structure fields alignments
- you should enforce natural alignment to prevent the compiler from arranging the fields in unpredictable ways
- you should use filler fields
- padding insertion
#include <asm/unaligned.h>
get_unaligned(ptr);
put_unaligned(val, ptr);
- pointers and error values
void *ERR_PTR(long error);
long IS_ERR(const void *ptr);
long PTR_ERR(const void *ptr);
- linked list
- use the following facility
struct list_head {
struct list_head *next, *prev;
};
// embed the above like this:
struct todo_struct {
struct list_head list;
int priority; /* driver specific */
/* ... add other driver-specific fields */
};
struct list_head todo_list;
INIT_LIST_HEAD(&todo_list); // runtime
LIST_HEAD(todo_list); // compile time
- PCI ... Peripheral Component Interconnect
- better performance, higher clock rate than ISA
- jumperless, autodetected at boot time
- identified by a bus number, a device number and a function number
$ lspci
18b5:00:00.0 System peripheral: Red Hat, Inc. Virtio file system (rev 01)
1fe5:00:00.0 3D controller: Microsoft Corporation Device 008e
538d:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio filesystem (rev 01)
aec5:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio filesystem (rev 01)
c2a1:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio console (rev 01)
c546:00:00.0 SCSI storage controller: Red Hat, Inc. Virtio filesystem (rev 01)
$ ll /proc/bus/pci/
total 0
dr-xr-xr-x 9 root root 0 Apr 27 12:44 ./
dr-xr-xr-x 4 root root 0 Apr 27 12:44 ../
dr-xr-xr-x 3 root root 0 Apr 27 12:44 18b5:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 1fe5:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 538d:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 aec5:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 c2a1:00/
dr-xr-xr-x 3 root root 0 Apr 27 12:44 c546:00/
-r--r--r-- 1 root root 0 Apr 27 12:44 devices
$ cat /proc/bus/pci/devices | cut -f1
0000
0000
0000
0000
0000
0000
$ tree /sys/bus/pci/devices/
/sys/bus/pci/devices/
├── 18b5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/46d9ad56-18b5-4d77-b62f-0b5891160bb1/pci18b5:00/18b5:00:00.0
├── 1fe5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/502c91c3-1fe5-4692-9534-b0c68e0b8170/pci1fe5:00/1fe5:00:00.0
├── 538d:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/400930b1-538d-4d70-9538-26c4b4e104a7/pci538d:00/538d:00:00.0
├── aec5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/0399541e-aec5-4829-b1f1-6154168c0f99/pciaec5:00/aec5:00:00.0
├── c2a1:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476/pcic2a1:00/c2a1:00:00.0
└── c546:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/9fb76897-c546-420b-9873-997bc89c414d/pcic546:00/c546:00:00.0
6 directories, 0 files
- three address spaces
- memory locations ... shared
- I/O ports ... shared
- configuration registers ... geographical addressing
- every PCI slot has four interrupt pins
- the firmware initializes PCI hardware at system boot, mapping each region to a different address to avoid collisions.
- https://github.com/pciutils/pciutils/blob/master/pci.ids
- boot time
struct pci_device_id foo;
// specify PCI_ANY_ID if a driver can handle any type of it
PCI_DEVICE(vendor, device);
PCI_DEVICE_CLASS(device_class, device_class_mask);
- PCI hotplug
/lib/modules/KERNEL_VERSION/modules.pcimap
MODULE_DEVICE_TABLE(pci, i810_ids)
- registering a PCI driver
- name
- id_table
- probe()
- remove()
- suspend() ... optional
- resume() ... optional
static struct pci_driver pci_driver = {
.name = "pci_skel",
.id_table = ids,
.probe = probe,
.remove = remove,
};
/* Module init: registers pci_driver via pci_register_driver() and returns that call's result. */
static int __init pci_skel_init(void) {
return pci_register_driver(&pci_driver);
}
/* Module exit: unregisters the same pci_driver structure that pci_skel_init() registered. */
static void __exit pci_skel_exit(void) {
pci_unregister_driver(&pci_driver);
}
int pci_enable_device(struct pci_dev *dev);
- accessing the configuration space
int pci_read_config_byte(struct pci_dev *dev, int where, u8 *val);
int pci_read_config_word(struct pci_dev *dev, int where, u16 *val); // byte-order
int pci_read_config_dword(struct pci_dev *dev, int where, u32 *val); // byte-order
int pci_write_config_byte(struct pci_dev *dev, int where, u8 val);
int pci_write_config_word(struct pci_dev *dev, int where, u16 val); // byte-order
int pci_write_config_dword(struct pci_dev *dev, int where, u32 val); // byte-order
/*
 * Read the byte at PCI_REVISION_ID from the device's configuration space.
 *
 * dev: PCI device to query.
 * Returns the byte stored by pci_read_config_byte(), or 0 when the call
 * leaves the output untouched.
 */
static unsigned char skel_get_revision(struct pci_dev *dev) {
    /* start from a defined value: the read's status is not checked here */
    u8 revision = 0;

    pci_read_config_byte(dev, PCI_REVISION_ID, &revision);
    return revision;
}
- accessing the I/O and memory spaces
unsigned long pci_resource_start(struct pci_dev *dev, int bar);
unsigned long pci_resource_end(struct pci_dev *dev, int bar);
unsigned long pci_resource_flags(struct pci_dev *dev, int bar);
result = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &myirq);
if (result) {
/* deal with error */
}
- hardware abstraction
- object oriented layout
- like the file operations
- the usual structure containing methods
- it adds just the minimal overhead of dereferencing a pointer to the normal overhead of a function call
struct pci_ops {
int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val);
int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val);
};
dev->bus->ops->read(bus, devfn, where, 8, val);
- PnP ... Plug-and-Play
- PC/104 ... ISA
- PC/104+ ... PCI
- other buses
- MCA ... micro channel architecture
- EISA ... extended ISA
- VLB ... VESA local bus
- SBus ... SPARC-based workstations
- MMU ... memory management unit
- NuBus
- external buses
- USB, FireWire and IEEE1284
- USB ... universal serial bus
- User
- Kernel
- VFS layers, block layers, net layers, char layers, TTY layers, ...
- USB device drivers
- USB core
- USB host controllers
- Hardware
- https://www.usb.org/
- Device
- Config
- Interface <- USB driver
- Endpoint
- Endpoint
- Endpoint
- Interface <- USB driver
- Endpoint
- Endpoint
- Endpoint
- Endpoint
- OUT endpoint
- IN endpoint
- unidirectional
- four types
- CONTROL
- INTERRUPT
- BULK
- printers, storage, network devices
- ISOCHRONOUS
struct usb_host_endpoint foo;
struct usb_endpoint_descriptor bar;
- each USB driver controls an interface
- e.g. Linux needs two different drivers for one set of speakers
struct usb_interface baz;
struct usb_host_config zap;
struct usb_device quax;
$ find /sys/devices/ -name *usb*
/sys/devices/platform/vhci_hcd.0/usbip_debug
/sys/devices/platform/vhci_hcd.0/usb1
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port3
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port1
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port8
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port6
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port4
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port2
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port7
/sys/devices/platform/vhci_hcd.0/usb1/1-0:1.0/usb1-port5
/sys/devices/platform/vhci_hcd.0/usb2
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port7
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port7/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port5
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port5/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port3
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port3/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port1
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port1/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port8
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port8/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port6
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port6/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port4
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port4/usb3_lpm_permit
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port2
/sys/devices/platform/vhci_hcd.0/usb2/2-0:1.0/usb2-port2/usb3_lpm_permit
root_hub-hub_port:config.interface
$ lsusb
Bus 002 Device 001: ID 1d6b:0003 Linux Foundation 3.0 root hub
Bus 001 Device 001: ID 1d6b:0002 Linux Foundation 2.0 root hub
- urb ... USB request block
- https://github.com/torvalds/linux/blob/master/include/linux/usb.h
- Created by a USB device driver.
- Assigned to a specific endpoint of a specific USB device.
- Submitted to the USB core, by the USB device driver.
- Submitted to the specific USB host controller driver for the specified device by the USB core.
- Processed by the USB host controller driver that makes a USB transfer to the device.
- When the urb is completed, the USB host controller driver notifies the USB device driver.
- do not create urb statically
- always create urbs with the following constructor; otherwise the urb's internal reference counting breaks
struct urb *usb_alloc_urb(int iso_packets, int mem_flags);
void usb_free_urb(struct urb *urb);
void usb_fill_int_urb(
struct urb *urb,
struct usb_device *dev,
unsigned int pipe,
void *transfer_buffer,
int buffer_length,
usb_complete_t complete,
void *context,
int interval
);
void usb_fill_bulk_urb(
struct urb *urb,
struct usb_device *dev,
unsigned int pipe,
void *transfer_buffer,
int buffer_length,
usb_complete_t complete,
void *context
);
void usb_fill_control_urb(
struct urb *urb,
struct usb_device *dev,
unsigned int pipe,
unsigned char *setup_packet,
void *transfer_buffer,
int buffer_length,
usb_complete_t complete,
void *context
);
// isochronous urbs are only initialized by hand like this:
urb->dev = dev;
urb->context = uvd;
urb->pipe = usb_rcvisocpipe(dev, uvd->video_endp-1);
urb->interval = 1;
urb->transfer_flags = URB_ISO_ASAP;
urb->transfer_buffer = cam->sts_buf[i];
urb->complete = konicawc_isoc_irq;
urb->number_of_packets = FRAMES_PER_DESC;
urb->transfer_buffer_length = FRAMES_PER_DESC;
for (j=0; j < FRAMES_PER_DESC; j++) {
urb->iso_frame_desc[j].offset = j;
urb->iso_frame_desc[j].length = 1;
}
int usb_submit_urb(struct urb *urb, int mem_flags);
// GFP_ATOMIC, GFP_NOIO, GFP_KERNEL
int usb_kill_urb(struct urb *urb);
int usb_unlink_urb(struct urb *urb);
/* table of devices that work with this driver */
static struct usb_device_id skel_table[] = {
{ USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
{} /* Terminating entry */
};
MODULE_DEVICE_TABLE(usb, skel_table);
static struct usb_device_id usb_ids[] = {
{.driver_info = 42},
{} /* Terminating entry */
};
static struct usb_driver skel_driver = {
.owner = THIS_MODULE,
.name = "skeleton",
.id_table = skel_table,
.probe = skel_probe,
.disconnect = skel_disconnect,
};
/*
 * Module init: hand skel_driver to the USB subsystem.
 * Logs a message when registration fails and returns usb_register()'s
 * result either way.
 */
static int __init usb_skel_init(void) {
    int rc = usb_register(&skel_driver);

    if (rc != 0)
        err("usb_register failed. Error number %d", rc);

    return rc;
}
/* Module exit: undoes the usb_register() call made in usb_skel_init(). */
static void __exit usb_skel_exit(void) {
/* deregister this driver with the USB subsystem */
usb_deregister(&skel_driver);
}
- also, you can transfer data without urbs via some helper functions
- usb_bulk_msg
- usb_control_msg
- usb_get_descriptor
- usb_string, usb_get_string
- etc...
- the linux device model
- Kobjects, Ksets, Subsystems
struct cdev {
struct kobject kobj;
struct module *owner;
struct file_operations *ops;
struct list_head list;
dev_t dev;
unsigned int count;
};
// back-casting
struct cdev *device = container_of(kp, struct cdev, kobj);
// initializing, you should do zeroing in advance
// side-effect: increments the reference count
void kobject_init(struct kobject *kobj);
int kobject_set_name(struct kobject *kobj, const char *format, ...);
// returns the address or NULL, increments the reference count
struct kobject *kobject_get(struct kobject *kobj);
// decrements the reference count
void kobject_put(struct kobject *kobj);
struct kobj_type {
void (*release)(struct kobject *);
struct sysfs_ops *sysfs_ops;
struct attribute **default_attrs;
};
struct kobj_type *get_ktype(struct kobject *kobj);
int kobject_add(struct kobject *kobj);
void kobject_del(struct kobject *kobj);
// kobject_init and kobject_add
extern int kobject_register(struct kobject *kobj);
// kobject_del and kobject_put
kobject_unregister
void kset_init(struct kset *kset);
int kset_add(struct kset *kset);
int kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);
struct kset *kset_get(struct kset *kset);
void kset_put(struct kset *kset);
kobject_set_name(&my_set->kobj, "The name");
- subsystems
- block_subsys
/sys/block
- device_subsys
/sys/devices
struct subsystem {
struct kset kset;
struct rw_semaphore rwsem;
};
decl_subsys(name, struct kobj_type *type, struct kset_hotplug_ops *hotplug_ops);
void subsystem_init(struct subsystem *subsys);
int subsystem_register(struct subsystem *subsys);
void subsystem_unregister(struct subsystem *subsys);
struct subsystem *subsys_get(struct subsystem *subsys);
void subsys_put(struct subsystem *subsys);
- kobjects are the mechanism behind the sysfs virtual filesystem.
- attributes
struct attribute {
char *name;
struct module *owner;
mode_t mode; // the last entry must be zero-filled
};
struct sysfs_ops {
ssize_t (*show)(struct kobject *kobj, struct attribute *attr, char *buffer);
ssize_t (*store)(struct kobject *kobj, struct attribute *attr, const char *buffer, size_t size);
};
int sysfs_create_file(struct kobject *kobj, struct attribute *attr);
int sysfs_remove_file(struct kobject *kobj, struct attribute *attr);
// when the user wants to upload firmware
struct bin_attribute {
struct attribute attr;
size_t size;
ssize_t (*read)(struct kobject *kobj, char *buffer, loff_t pos, size_t size);
ssize_t (*write)(struct kobject *kobj, char *buffer, loff_t pos, size_t size);
};
int sysfs_create_bin_file(struct kobject *kobj, struct bin_attribute *attr);
int sysfs_remove_bin_file(struct kobject *kobj, struct bin_attribute *attr);
// linking between drivers and devices
int sysfs_create_link(struct kobject *kobj, struct kobject *target, char *name);
void sysfs_remove_link(struct kobject *kobj, char *name);
- hotplug event generation
- a notification to user space from kernel that something has changed in the system's configuration
- e.g. loading drivers, creating device nodes, mounting partitions
/sbin/hotplug
struct kset_hotplug_ops {
int (*filter)(struct kset *kset, struct kobject *kobj);
char *(*name)(struct kset *kset, struct kobject *kobj);
int (*hotplug)(struct kset *kset, struct kobject *kobj, char **envp, int num_envp, char *buffer, int buffer_size);
};
/*
 * Hotplug filter: returns 1 when kobj's ktype is ktype_block or
 * ktype_part, and 0 for any other ktype. kset is not consulted.
 */
static int block_hotplug_filter(struct kset *kset, struct kobject *kobj) {
    struct kobj_type *type = get_ktype(kobj);

    if (type == &ktype_block)
        return 1;
    return type == &ktype_part;
}
struct bus_type {
char *name;
struct subsystem subsys;
struct kset drivers;
struct kset devices;
int (*match)(struct device *dev, struct device_driver *drv);
struct device *(*add)(struct device * parent, char * bus_id);
int (*hotplug) (struct device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
/* Some fields omitted */
};
int bus_register(struct bus_type *bus);
void bus_unregister(struct bus_type *bus);
struct bus_attribute {
struct attribute attr;
ssize_t (*show)(struct bus_type *bus, char *buf);
ssize_t (*store)(struct bus_type *bus, const char *buf, size_t count);
};
int bus_create_file(struct bus_type *bus, struct bus_attribute *attr);
void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr);
struct device {
struct device *parent;
struct kobject kobj;
char bus_id[BUS_ID_SIZE];
struct bus_type *bus;
struct device_driver *driver;
void *driver_data;
void (*release)(struct device *dev);
/* Several fields omitted */
};
int device_register(struct device *dev);
void device_unregister(struct device *dev);
struct device_attribute {
struct attribute attr;
ssize_t (*show)(struct device *dev, char *buf);
ssize_t (*store)(struct device *dev, const char *buf, size_t count);
};
DEVICE_ATTR(name, mode, show, store);
int device_create_file(struct device *device, struct device_attribute *entry);
void device_remove_file(struct device *dev, struct device_attribute *attr);
struct device_driver {
char *name;
struct bus_type *bus;
struct kobject kobj;
struct list_head devices;
int (*probe)(struct device *dev);
int (*remove)(struct device *dev);
void (*shutdown) (struct device *dev);
};
int driver_register(struct device_driver *drv);
void driver_unregister(struct device_driver *drv);
struct driver_attribute {
struct attribute attr;
ssize_t (*show)(struct device_driver *drv, char *buf);
ssize_t (*store)(struct device_driver *drv, const char *buf, size_t count);
};
DRIVER_ATTR(name, mode, show, store);
int driver_create_file(struct device_driver *drv, struct driver_attribute *attr);
void driver_remove_file(struct device_driver *drv, struct driver_attribute *attr);
- classes
- e.g. disks
/sys/class
- interfaces
- class_simple
- regular class
struct class_simple *class_simple_create(struct module *owner, char *name);
void class_simple_destroy(struct class_simple *cs);
struct class_device *class_simple_device_add(
struct class_simple *cs,
dev_t devnum,
struct device *device,
const char *fmt,
...
);
int class_simple_set_hotplug(
struct class_simple *cs,
int (*hotplug)(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size)
);
void class_simple_device_remove(dev_t dev);
struct class {
char *name;
struct class_attribute *class_attrs;
struct class_device_attribute *class_dev_attrs;
int (*hotplug)(struct class_device *dev, char **envp, int num_envp, char *buffer, int buffer_size);
void (*release)(struct class_device *dev);
void (*class_release)(struct class *class);
/* Some fields omitted */
};
int class_register(struct class *cls);
void class_unregister(struct class *cls);
struct class_attribute {
struct attribute attr;
ssize_t (*show)(struct class *cls, char *buf);
ssize_t (*store)(struct class *cls, const char *buf, size_t count);
};
CLASS_ATTR(name, mode, show, store);
int class_create_file(struct class *cls, const struct class_attribute *attr);
void class_remove_file(struct class *cls, const struct class_attribute *attr);
struct class_device {
struct kobject kobj;
struct class *class;
struct device *dev;
void *class_data;
char class_id[BUS_ID_SIZE];
};
int class_device_register(struct class_device *cd);
void class_device_unregister(struct class_device *cd);
int class_device_rename(struct class_device *cd, char *new_name);
struct class_device_attribute {
struct attribute attr;
ssize_t (*show)(struct class_device *cls, char *buf);
ssize_t (*store)(struct class_device *cls, const char *buf, size_t count);
};
CLASS_DEVICE_ATTR(name, mode, show, store);
int class_device_create_file(struct class_device *cls, const struct class_device_attribute *attr);
void class_device_remove_file(struct class_device *cls, const struct class_device_attribute *attr);
struct class_interface {
struct class *class;
int (*add) (struct class_device *cd);
void (*remove) (struct class_device *cd);
};
int class_interface_register(struct class_interface *intf);
void class_interface_unregister(struct class_interface *intf);
- add a device
pci_bus_type
variable is registered with the driver core when the PCI subsystem is loaded in the kernel with a call to bus_register
- the driver core creates a sysfs directory in
/sys/bus/pci
that consists of two directories: devices and drivers
- all PCI drivers must define a struct
pci_driver
variable
- the structure contains a struct
device_driver
initialized by the PCI core when the PCI driver is registered
driver_register
device_register
struct bus_type pci_bus_type = {
.name = "pci",
.match = pci_bus_match,
.hotplug = pci_hotplug,
.suspend = pci_device_suspend,
.resume = pci_device_resume,
.dev_attrs = pci_dev_attrs,
};
struct pci_dev {
/* ... */
unsigned int devfn;
unsigned short vendor;
unsigned short device;
unsigned short subsystem_vendor;
unsigned short subsystem_device;
unsigned int class;
/* ... */
struct pci_driver *driver;
/* ... */
struct device dev;
/* ... */
};
$ tree -d /sys/bus/pci
/sys/bus/pci
├── devices
│ ├── 18b5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/46d9ad56-18b5-4d77-b62f-0b5891160bb1/pci18b5:00/18b5:00:00.0
│ ├── 1fe5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/502c91c3-1fe5-4692-9534-b0c68e0b8170/pci1fe5:00/1fe5:00:00.0
│ ├── 538d:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/400930b1-538d-4d70-9538-26c4b4e104a7/pci538d:00/538d:00:00.0
│ ├── aec5:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/0399541e-aec5-4829-b1f1-6154168c0f99/pciaec5:00/aec5:00:00.0
│ ├── c2a1:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476/pcic2a1:00/c2a1:00:00.0
│ └── c546:00:00.0 -> ../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/9fb76897-c546-420b-9873-997bc89c414d/pcic546:00/c546:00:00.0
├── drivers
│ ├── dxgkrnl
│ │ ├── 1fe5:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/502c91c3-1fe5-4692-9534-b0c68e0b8170/pci1fe5:00/1fe5:00:00.0
│ │ └── module -> ../../../../module/dxgkrnl
│ ├── pcieport
│ ├── serial
│ ├── vfio-pci
│ │ └── module -> ../../../../module/vfio_pci
│ └── virtio-pci
│ ├── 18b5:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/46d9ad56-18b5-4d77-b62f-0b5891160bb1/pci18b5:00/18b5:00:00.0
│ ├── 538d:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/400930b1-538d-4d70-9538-26c4b4e104a7/pci538d:00/538d:00:00.0
│ ├── aec5:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/0399541e-aec5-4829-b1f1-6154168c0f99/pciaec5:00/aec5:00:00.0
│ ├── c2a1:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476/pcic2a1:00/c2a1:00:00.0
│ ├── c546:00:00.0 -> ../../../../devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0004:00/VMBUS:00/9fb76897-c546-420b-9873-997bc89c414d/pcic546:00/c546:00:00.0
│ └── module -> ../../../../module/virtio_pci
└── slots
├── 1074344113
├── 1188670806
├── 1345098179
├── 2415185341
├── 2679597207
└── 60380190
29 directories
- remove a device
- fakephp.c
pci_remove_bus_device
- add a driver
pci_register_driver
struct device_driver
driver_register
- remove a driver
down(&drv->unload_sem);
up(&drv->unload_sem);
result = readl(ptr);
if (result == ~(u32)0) {
return -ENODEV; /* card removed */
}
# Run every executable *.hotplug agent found for the event type in $1,
# followed by the agents in the "default" directory.
DIR="/etc/hotplug.d"
for I in "${DIR}/$1/"*.hotplug "${DIR}/"default/*.hotplug ; do
    # quote the expansions so agent paths containing whitespace stay one word
    if [ -f "$I" ]; then
        test -x "$I" && "$I" "$1" ;
    fi
done
exit 1
- environment variables
- IEEE1394 (FireWire)
- VENDOR_ID
- MODEL_ID
- GUID
- SPECIFIER_ID
- VERSION
- Networking
- PCI
- PCI_CLASS
- PCI_ID
- PCI_SUBSYSTEM_ID
- subsys_vendor:subsys_device
- PCI_SLOT_NAME
- Input (mice, keyboards, joysticks, etc.)
- PRODUCT
- bustype:vendor:product:version hex
- if the device supports:
- NAME
- PHYS
- EV
- KEY
- REL
- ABS
- MSC
- LED
- SND
- FF
- USB
- PRODUCT
- idVendor/idProduct/bcdDevice
- TYPE
- bDeviceClass/bDeviceSubClass/bDeviceProtocol
- if the bDeviceClass field is set to 0:
- INTERFACE
- bInterfaceClass/bInterfaceSubClass/bInterfaceProtocol
- if the kernel build option, CONFIG_USB_DEVICEFS
- DEVICE
- /proc/bus/usb/USB_BUS_NUMBER/USB_DEVICE_NUMBER
- SCSI
- Laptop docking stations
- S/390 and zSeries
- Linux hotplug scripts
MODULE_DEVICE_TABLE
/lib/modules/KERNEL_VERSION/modules.*map
- udev
class_simple_create
class_simple_device_add
class_simple_device_remove
class_simple_destroy
- dealing with firmware
- EEPROM
- electrically erasable programmable read-only memory
- the kernel firmware interface
#include <linux/firmware.h>
int request_firmware(const struct firmware **fw, char *name, struct device *device);
struct firmware {
size_t size;
u8 *data;
};
void release_firmware(struct firmware *fw);
int request_firmware_nowait(
struct module *module,
char *name,
struct device *device,
void *context,
void (*cont)(const struct firmware *fw, void *context)
);
- memory mapping
- user virtual addresses
- physical addresses
- between the processor and the system's memory
- bus addresses
- between peripheral buses and memory
- IOMMU ... I/O memory management unit
- kernel logical addresses
- kernel virtual addresses
- physical addresses and pages
- PFN ... page frame number
PAGE_SHIFT
- high and low memory
struct page *virt_to_page(void *kaddr);
struct page *pfn_to_page(int pfn); // page frame number
void *page_address(struct page *page); // use kmap instead
void *kmap(struct page *page);
void kunmap(struct page *page);
void *kmap_atomic(struct page *page, enum km_type type);
void kunmap_atomic(void *addr, enum km_type type);
- page tables
- the processor must have a mechanism for translating virtual addresses into their corresponding physical addresses
- on any modern system
- that is called page table
- a multilevel tree-structured array
- virtual memory areas (VMA)
- text ... the program's executable code
- multiple areas for data
- initialized data
- uninitialized data
- BSS ... block started by symbol
- the program stack
- one area for each active memory mapping
/proc/<pid>/maps
/proc/self
- start-end perm offset major:minor inode image
- perm's
p
means private
- confusingly, for device mappings, the major and minor numbers refer to the disk partition holding the device special file that was opened by the user, and not the device itself
struct vm_area_struct
- to support
mmap
for user
- the process memory map
$ cat /proc/self/maps
55ecc9c26000-55ecc9c28000 r--p 00000000 08:20 14816 /usr/bin/cat
55ecc9c28000-55ecc9c2c000 r-xp 00002000 08:20 14816 /usr/bin/cat
55ecc9c2c000-55ecc9c2e000 r--p 00006000 08:20 14816 /usr/bin/cat
55ecc9c2e000-55ecc9c2f000 r--p 00007000 08:20 14816 /usr/bin/cat
55ecc9c2f000-55ecc9c30000 rw-p 00008000 08:20 14816 /usr/bin/cat
55ecca6ba000-55ecca6db000 rw-p 00000000 00:00 0 [heap]
7fdd4762f000-7fdd47651000 rw-p 00000000 00:00 0
7fdd47651000-7fdd476a8000 r--p 00000000 08:20 42519 /usr/lib/locale/C.utf8/LC_CTYPE
7fdd476a8000-7fdd476a9000 r--p 00000000 08:20 42552 /usr/lib/locale/C.utf8/LC_NUMERIC
7fdd476a9000-7fdd476aa000 r--p 00000000 08:20 42569 /usr/lib/locale/C.utf8/LC_TIME
7fdd476aa000-7fdd476ab000 r--p 00000000 08:20 40718 /usr/lib/locale/C.utf8/LC_COLLATE
7fdd476ab000-7fdd476ac000 r--p 00000000 08:20 42536 /usr/lib/locale/C.utf8/LC_MONETARY
7fdd476ac000-7fdd476ad000 r--p 00000000 08:20 42533 /usr/lib/locale/C.utf8/LC_MESSAGES/SYS_LC_MESSAGES
7fdd476ad000-7fdd476ae000 r--p 00000000 08:20 42555 /usr/lib/locale/C.utf8/LC_PAPER
7fdd476ae000-7fdd476af000 r--p 00000000 08:20 42540 /usr/lib/locale/C.utf8/LC_NAME
7fdd476af000-7fdd476b0000 r--p 00000000 08:20 40486 /usr/lib/locale/C.utf8/LC_ADDRESS
7fdd476b0000-7fdd47999000 r--p 00000000 08:20 13525 /usr/lib/locale/locale-archive
7fdd47999000-7fdd4799c000 rw-p 00000000 00:00 0
7fdd4799c000-7fdd479c4000 r--p 00000000 08:20 39598 /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd479c4000-7fdd47b59000 r-xp 00028000 08:20 39598 /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47b59000-7fdd47bb1000 r--p 001bd000 08:20 39598 /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47bb1000-7fdd47bb5000 r--p 00214000 08:20 39598 /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47bb5000-7fdd47bb7000 rw-p 00218000 08:20 39598 /usr/lib/x86_64-linux-gnu/libc.so.6
7fdd47bb7000-7fdd47bc4000 rw-p 00000000 00:00 0
7fdd47bc4000-7fdd47bc5000 r--p 00000000 08:20 42563 /usr/lib/locale/C.utf8/LC_TELEPHONE
7fdd47bc5000-7fdd47bc6000 r--p 00000000 08:20 42522 /usr/lib/locale/C.utf8/LC_MEASUREMENT
7fdd47bc6000-7fdd47bcd000 r--s 00000000 08:20 42977 /usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache
7fdd47bcd000-7fdd47bcf000 rw-p 00000000 00:00 0
7fdd47bcf000-7fdd47bd1000 r--p 00000000 08:20 10010 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47bd1000-7fdd47bfb000 r-xp 00002000 08:20 10010 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47bfb000-7fdd47c06000 r--p 0002c000 08:20 10010 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47c06000-7fdd47c07000 r--p 00000000 08:20 42521 /usr/lib/locale/C.utf8/LC_IDENTIFICATION
7fdd47c07000-7fdd47c09000 r--p 00037000 08:20 10010 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7fdd47c09000-7fdd47c0b000 rw-p 00039000 08:20 10010 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
7ffe673cf000-7ffe673f0000 rw-p 00000000 00:00 0 [stack]
7ffe673f0000-7ffe673f4000 r--p 00000000 00:00 0 [vvar]
7ffe673f4000-7ffe673f6000 r-xp 00000000 00:00 0 [vdso]
$ cat /proc/iomem
00000000-00000000 : Reserved
00000000-00000000 : System RAM
00000000-00000000 : Reserved
00000000-00000000 : System ROM
00000000-00000000 : ACPI Tables
00000000-00000000 : System RAM
00000000-00000000 : Kernel code
00000000-00000000 : Kernel rodata
00000000-00000000 : Kernel data
00000000-00000000 : Kernel bss
00000000-00000000 : 8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476
00000000-00000000 : 502c91c3-1fe5-4692-9534-b0c68e0b8170
00000000-00000000 : 0399541e-aec5-4829-b1f1-6154168c0f99
00000000-00000000 : 46d9ad56-18b5-4d77-b62f-0b5891160bb1
00000000-00000000 : 400930b1-538d-4d70-9538-26c4b4e104a7
00000000-00000000 : 9fb76897-c546-420b-9873-997bc89c414d
00000000-00000000 : PNP0003:00
00000000-00000000 : Local APIC
00000000-00000000 : PNP0003:00
00000000-00000000 : System RAM
00000000-00000000 : 8ff4cdbd-c2a1-488e-8af1-b07e7ba7d476
00000000-00000000 : c2a1:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : c2a1:00:00.0
00000000-00000000 : c2a1:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : 0399541e-aec5-4829-b1f1-6154168c0f99
00000000-00000000 : aec5:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : aec5:00:00.0
00000000-00000000 : aec5:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : 400930b1-538d-4d70-9538-26c4b4e104a7
00000000-00000000 : 538d:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : 538d:00:00.0
00000000-00000000 : 538d:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : 9fb76897-c546-420b-9873-997bc89c414d
00000000-00000000 : c546:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : c546:00:00.0
00000000-00000000 : c546:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : 711dad3a-73ce-468b-90a9-ede6906841b2
00000000-00000000 : 46d9ad56-18b5-4d77-b62f-0b5891160bb1
00000000-00000000 : 18b5:00:00.0
00000000-00000000 : virtio2
00000000-00000000 : 18b5:00:00.0
00000000-00000000 : virtio-pci-modern
00000000-00000000 : 18b5:00:00.0
- the mmap device operation
mmap (caddr_t addr, size_t len, int prot, int flags, int fd, off_t offset)
int (*mmap) (struct file *filp, struct vm_area_struct *vma);
// refers to actual system RAM
int remap_pfn_range(
struct vm_area_struct *vma,
unsigned long virt_addr,
unsigned long pfn, // page frame number, vma->vm_pgoff
unsigned long size,
pgprot_t prot // protection, vma->vm_page_prot
);
// phys_addr points to I/O memory
int io_remap_page_range(
struct vm_area_struct *vma,
unsigned long virt_addr,
unsigned long phys_addr,
unsigned long size,
pgprot_t prot
);
- references to device memory should not be cached by the processor
- often the system BIOS sets things up properly, but
pgprot_noncached
- simple
- Simple Implementation Mapping Pages with Little Enthusiasm
/*
 * mmap method of the "simple" device: map the whole requested range at once.
 *
 * The range [vma->vm_start, vma->vm_end) is backed by the page frames that
 * start at vma->vm_pgoff, using the protection bits already stored in the VMA.
 *
 * Returns 0 on success, or -EAGAIN if remap_pfn_range() reports a failure.
 */
static int simple_remap_mmap(struct file *filp, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot)) {
		return -EAGAIN;
	}

	/* Install the VMA callbacks, then run open by hand for this first mapping. */
	vma->vm_ops = &simple_remap_vm_ops;
	simple_vma_open(vma);
	return 0;
}
void simple_vma_open(struct vm_area_struct *vma) {
printk(KERN_NOTICE "Simple VMA open, virt %lx, phys %lx\n", vma->vm_start, vma->vm_pgoff << PAGE_SHIFT);
}
/* VMA close callback: only logs a notice; the VMA itself is not touched. */
void simple_vma_close(struct vm_area_struct *vma)
{
	printk(KERN_NOTICE "Simple VMA close.\n");
}
/*
 * VMA operations installed by the mmap method: only the open and close
 * callbacks are provided, every other operation is left unset.
 */
static struct vm_operations_struct simple_remap_vm_ops = {
.open = simple_vma_open,
.close = simple_vma_close,
};
vma->vm_ops = &simple_remap_vm_ops;
simple_vma_open(vma);
- implement
nopage
- to support
mremap
- you must use
remap_pfn_range
for the PCI bus
nopage
does not work with PCI memory areas, so extension of PCI mappings is not possible
struct page *(*nopage)(struct vm_area_struct *vma, unsigned long address, int *type);
get_page(struct page *pageptr);
unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
unsigned long physical = simple_region_start + off;
unsigned long vsize = vma->vm_end - vma->vm_start;
unsigned long psize = simple_region_size - off;
if (vsize > psize) {
return -EINVAL; /* spans too high */
}
remap_pfn_range(vma, vma_>vm_start, physical, vsize, vma->vm_page_prot);
// prevent the zero page from being mapped when the mapping is extended past the end of the region
/*
 * nopage handler that never supplies a page: every fault that reaches it is
 * answered with NOPAGE_SIGBUS.
 */
struct page *simple_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
{
	/* send a SIGBUS */
	return NOPAGE_SIGBUS;
}
- remapping RAM
- remapping kernel virtual addresses
- rarely necessary
vmalloc_to_page
- performing direct I/O
#include <linux/mm.h>
int get_user_pages(
struct task_struct *tsk,
struct mm_struct *mm,
unsigned long start,
int len,
int write,
int force,
struct page **pages,
struct vm_area_struct **vmas
);
down_read(¤t->mm->mmap_sem);
result = get_user_pages(current, current->mm, ...);
up_read(¤t->mm->mmap_sem);
#include <linux/page-flags.h>
void SetPageDirty(struct page *page);
if (!PageReserved(page)) {
SetPageDirty(page);
}
void page_cache_release(struct page *page);
ssize_t (*aio_read) (struct kiocb *iocb, char *buffer, size_t count, loff_t offset);
ssize_t (*aio_write) (struct kiocb *iocb, const char *buffer, size_t count, loff_t offset);
int (*aio_fsync) (struct kiocb *iocb, int datasync);
// the kernel occasionally creates synchronous IOCBs; your driver should query for them:
int is_sync_kiocb(struct kiocb *iocb);
// if this function returns a nonzero value, your driver must execute the operation synchronously
int aio_complete(struct kiocb *iocb, long res, long res2);
// if you need a huge buffer, but it is discouraged
dmabuf = ioremap(0xFF00000 /* 255M */, 0x100000 /* 1M */);
// the use of these functions is strongly discouraged, use the generic DMA layer instead
unsigned long virt_to_bus(volatile void *address);
void *bus_to_virt(unsigned long address);
// if your device does not support 32-bit DMA operations
int dma_set_mask(struct device *dev, u64 mask);
/*
 * Ask whether DMA restricted to a 24-bit address mask is possible.
 * dma_set_mask() returns 0 when the mask is accepted and a nonzero error
 * code otherwise, so DMA is enabled only on a zero return.
 */
if (dma_set_mask(dev, 0xffffff) == 0) {
	card->use_dma = 1;
} else {
	card->use_dma = 0; /* We'll have to live without DMA */
	printk(KERN_WARNING "mydev: DMA not supported\n");
}
virt_to_bus
is not suitable, IOMMU
- a bounce buffer
- cache coherency: cached data must be flushed out first
dma_addr_t
- streaming DMA mappings, it is recommended by the kernel developers, over coherent DMA mappings whenever possible
void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, int flag);
void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle);
/* Create a DMA pool; the last three parameters are all sizes (size_t). */
struct dma_pool *dma_pool_create(
	const char *name,
	struct device *dev,
	size_t size,
	size_t align,
	size_t allocation
);
void *dma_pool_alloc(struct dma_pool *pool, int mem_flags, dma_addr_t *handle);
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
void dma_pool_destroy(struct dma_pool *pool);
- streaming DMA buffer mappings
dma_addr_t dma_map_single(struct device *dev, void *buffer, size_t size, enum dma_data_direction direction);
void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction);
void dma_sync_single_for_cpu(struct device *dev, dma_handle_t bus_addr, size_t size, enum dma_data_direction direction);
void dma_sync_single_for_device(struct device *dev, dma_handle_t bus_addr, size_t size, enum dma_data_direction direction);
// single-page streaming mappings
dma_addr_t dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction);
void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, enum dma_data_direction direction);
int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);
void dma_unmap_sg(struct device *dev, struct scatterlist *list, int nents, enum dma_data_direction direction);
dma_addr_t sg_dma_address(struct scatterlist *sg);
unsigned int sg_dma_len(struct scatterlist *sg);
void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);
void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction);
$ cat /proc/dma
4: cascade
int request_dma(unsigned int channel, const char *name);
void free_dma(unsigned int channel);
unsigned long claim_dma_lock( );
void release_dma_lock(unsigned long flags);
void set_dma_mode(unsigned int channel, char mode);
void set_dma_addr(unsigned int channel, unsigned int addr);
void set_dma_count(unsigned int channel, unsigned int count);
void disable_dma(unsigned int channel);
void enable_dma(unsigned int channel);
int get_dma_residue(unsigned int channel);
void clear_dma_ff(unsigned int channel);
/*
 * Program a DMA channel for one transfer. The whole sequence runs between
 * claim_dma_lock() and release_dma_lock(); the value returned by the former
 * is handed back to the latter.
 */
unsigned long flags;
flags = claim_dma_lock();
/* The channel is disabled while it is reprogrammed and re-enabled at the end. */
disable_dma(channel);
clear_dma_ff(channel);
set_dma_mode(channel, mode);
/* set_dma_addr() is given the bus address of buf, obtained with virt_to_bus(). */
set_dma_addr(channel, virt_to_bus(buf));
set_dma_count(channel, count);
enable_dma(channel);
release_dma_lock(flags);
int residue;
unsigned long flags = claim_dma_lock ();
residue = get_dma_residue(channel);
release_dma_lock(flags);
// residue == 0
- block device drivers
- different from char devices
- a block is a fixed-size chunk of data, often 4096 bytes
- a sector, 512-bytes
- block driver registration
int register_blkdev(unsigned int major, const char *name);
int unregister_blkdev(unsigned int major, const char *name);
struct gendisk *alloc_disk(int minors);
void del_gendisk(struct gendisk *gd);
void add_disk(struct gendisk *gd);
$ cat /proc/partitions
major minor #blocks name
1 0 65536 ram0
1 1 65536 ram1
1 2 65536 ram2
1 3 65536 ram3
1 4 65536 ram4
1 5 65536 ram5
1 6 65536 ram6
1 7 65536 ram7
1 8 65536 ram8
1 9 65536 ram9
1 10 65536 ram10
1 11 65536 ram11
1 12 65536 ram12
1 13 65536 ram13
1 14 65536 ram14
1 15 65536 ram15
8 0 372040 sda
8 16 1048580 sdb
8 32 268435456 sdc
/*
 * Open method: cancel the pending timer, remember the device in the file's
 * private data, and count the new user. The disk-change check runs only when
 * there were no users before this open.
 */
static int sbull_open(struct inode *inode, struct file *filp)
{
	struct sbull_dev *dev = inode->i_bdev->bd_disk->private_data;

	del_timer_sync(&dev->timer);
	filp->private_data = dev;

	spin_lock(&dev->lock);
	if (dev->users == 0) {
		check_disk_change(inode->i_bdev);
	}
	dev->users++;
	spin_unlock(&dev->lock);

	return 0;
}
/*
 * Release method: drop one user under the device lock. When the count
 * reaches zero, the device timer is armed to fire INVALIDATE_DELAY jiffies
 * from now.
 */
static int sbull_release(struct inode *inode, struct file *filp)
{
	struct sbull_dev *dev = inode->i_bdev->bd_disk->private_data;

	spin_lock(&dev->lock);
	if (--dev->users == 0) {
		dev->timer.expires = jiffies + INVALIDATE_DELAY;
		add_timer(&dev->timer);
	}
	spin_unlock(&dev->lock);

	return 0;
}
/* Report the device's media_change flag as stored in the disk's private data. */
int sbull_media_changed(struct gendisk *gd)
{
	struct sbull_dev *sdev = gd->private_data;

	return sdev->media_change;
}
/*
 * Revalidate method: if a media change is flagged, clear the flag and zero
 * the whole data area. Always returns 0.
 */
int sbull_revalidate(struct gendisk *gd)
{
	struct sbull_dev *sdev = gd->private_data;

	if (!sdev->media_change) {
		return 0;
	}

	sdev->media_change = 0;
	memset(sdev->data, 0, sdev->size);
	return 0;
}
int sbull_ioctl (struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) {
long size;
struct hd_geometry geo;
struct sbull_dev *dev = filp->private_data;
switch(cmd) {
case HDIO_GETGEO:
/*
* Get geometry: since we are a virtual device, we have to make
* up something plausible. So we claim 16 sectors, four heads,
* and calculate the corresponding number of cylinders. We set the
* start of data at sector four.
*/
size = dev->size*(hardsect_size/KERNEL_SECTOR_SIZE);
geo.cylinders = (size & ~0x3f) >> 6;
geo.heads = 4;
geo.sectors = 16;
geo.start = 4;
if (copy_to_user((void _ _user *) arg, &geo, sizeof(geo))) {
return -EFAULT;
}
return 0;
}
return -ENOTTY; /* unknown command */
}
- request processing
- an example for the simple way
- real-world block drivers devote much of their effort to performance optimization
void request(request_queue_t *queue);
dev->queue = blk_init_queue(sbull_request, &dev->lock);
void end_request(struct request *req, int succeeded);
/*
 * Request function: take requests off the queue until none are left.
 * Requests that are not filesystem requests are completed as failed;
 * every other request is passed to sbull_transfer() and completed as
 * successful.
 */
static void sbull_request(request_queue_t *q)
{
	struct request *req;

	for (req = elv_next_request(q); req != NULL; req = elv_next_request(q)) {
		struct sbull_dev *dev = req->rq_disk->private_data;

		if (blk_fs_request(req)) {
			sbull_transfer(dev, req->sector, req->current_nr_sectors, req->buffer, rq_data_dir(req));
			end_request(req, 1);
		} else {
			printk (KERN_NOTICE "Skip non-fs request\n");
			end_request(req, 0);
		}
	}
}
/*
 * Copy nsect sectors between the device's data area and buffer.
 *
 * dev    - device whose data area is read or written
 * sector - first sector, in units of KERNEL_SECTOR_SIZE bytes
 * nsect  - number of sectors to move
 * buffer - memory to copy from (write) or into (read)
 * write  - nonzero copies buffer into the device, zero copies out of it
 *
 * A transfer that would run past dev->size is logged and dropped.
 */
static void sbull_transfer(struct sbull_dev *dev, unsigned long sector, unsigned long nsect, char *buffer, int write)
{
	unsigned long offset = sector*KERNEL_SECTOR_SIZE;
	unsigned long nbytes = nsect*KERNEL_SECTOR_SIZE;

	/* Compare without adding offset and nbytes, so the sum cannot wrap around. */
	if (offset > dev->size || nbytes > dev->size - offset) {
		printk (KERN_NOTICE "Beyond-end write (%lu %lu)\n", offset, nbytes);
		return;
	}

	if (write) {
		memcpy(dev->data + offset, buffer, nbytes);
	} else {
		memcpy(buffer, dev->data + offset, nbytes);
	}
}
request_queue_t *blk_init_queue(request_fn_proc *request, spinlock_t *lock);
void blk_cleanup_queue(request_queue_t *);
struct request *elv_next_request(request_queue_t *queue);
void blkdev_dequeue_request(struct request *req);
void elv_requeue_request(request_queue_t *queue, struct request *req);
- the request details
- the barrier requests
- written to the drive
REQ_HARDBARRIER
- the nonretryable requests
- request completion functions
int end_that_request_first(struct request *req, int success, int count);
void end_that_request_last(struct request *req);
- doing without a request queue
- "no queue" mode
- for flash memory, RAM disks, software RAID arrays, etc...
// The make_request should return 0, regardless of whether the I/O is successful.
typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
void bio_endio(struct bio *bio, unsigned int bytes, int error);
request_queue_t *blk_alloc_queue(int flags);
void blk_queue_make_request(request_queue_t *queue, make_request_fn *func);
/* "No queue" setup: allocate a bare queue and attach the make_request function. */
dev->queue = blk_alloc_queue(GFP_KERNEL);
if (!dev->queue) {
	goto out_vfree;
}
blk_queue_make_request(dev->queue, sbull_make_request);
- TCQ ... tagged command queueing
$ cat /etc/networks
snullnet0 192.168.0.0
snullnet1 192.168.1.0
$ cat /etc/hosts
192.168.0.1 local0
192.168.0.2 remote0
192.168.1.2 local1
192.168.1.1 remote1
$ ifconfig sn0 local0
$ ifconfig sn1 local1
// size of private data, name is printf style
struct net_device *alloc_netdev(int sizeof_priv, const char *name, void (*setup)(struct net_device *));
struct net_device *alloc_etherdev(int sizeof_priv);
/* The two snull interfaces. */
struct net_device *snull_devs[2];
/* Both are allocated with room for a struct snull_priv, the name template "sn%d", and snull_init as setup function. */
snull_devs[0] = alloc_netdev(sizeof(struct snull_priv), "sn%d", snull_init);
snull_devs[1] = alloc_netdev(sizeof(struct snull_priv), "sn%d", snull_init);
/* If either allocation failed, jump to the out label (not shown in this snippet). */
if (snull_devs[0] == NULL || snull_devs[1] == NULL) {
goto out;
}
/* Register both devices; a failure is only logged here and the loop carries on. */
for (i = 0; i < 2; i++) {
if ((result = register_netdev(snull_devs[i]))) {
printk("snull: error %i registering device \"%s\"\n", result, snull_devs[i]->name);
}
}
// direct access is discouraged
struct snull_priv *priv = netdev_priv(dev);
/* Tear down each interface that was allocated: unregister, release its pool, free it. */
for (int i = 0; i < 2; i++) {
	struct net_device *ndev = snull_devs[i];

	if (!ndev) {
		continue;
	}
	unregister_netdev(ndev);
	snull_teardown_pool(ndev);
	free_netdev(ndev);
}
- ARP ... address resolution protocol
IFF_NOARP
- MAC ... Ethernet medium access control addresses
- UTP ... unshielded twisted pair
- MTU ... maximum transfer unit
- SIOCSIFADDR ... Socket I/O Control Set Interface Address
- SIOCSIFFLAGS ... Socket I/O Control Set Interface Flags
void netif_start_queue(struct net_device *dev);
void netif_stop_queue(struct net_device *dev);
void netif_wake_queue(struct net_device *dev);
void netif_tx_disable(struct net_device *dev);
if (skb_shinfo(skb)->nr_frags == 0) {
/* Just use skb->data and skb->len as usual */
}
/*
 * One fragment descriptor of a fragmented skb (see the nr_frags check on
 * skb_shinfo(skb)).
 * NOTE(review): field meanings below are inferred from their names - confirm
 * against the kernel headers.
 */
struct skb_frag_struct {
struct page *page; /* presumably the page holding the fragment data */
__u16 page_offset; /* presumably the offset of the data within that page */
__u16 size; /* presumably the length of the fragment */
};
/*
 * Receive path: wrap a received packet in a freshly allocated skb and hand
 * it to the upper layers.
 *
 * dev - interface the packet arrived on
 * pkt - packet data and its length (pkt->data, pkt->datalen)
 *
 * If no skb can be allocated, the packet is dropped and counted in
 * rx_dropped; otherwise rx_packets and rx_bytes are updated.
 */
void snull_rx(struct net_device *dev, struct snull_packet *pkt)
{
	struct sk_buff *skb;
	struct snull_priv *priv = netdev_priv(dev);

	/*
	 * The packet has been retrieved from the transmission
	 * medium. Build an skb around it, so upper layers can handle it
	 */
	skb = dev_alloc_skb(pkt->datalen + 2);
	if (!skb) {
		if (printk_ratelimit()) {
			printk(KERN_NOTICE "snull rx: low on mem - packet dropped\n");
		}
		priv->stats.rx_dropped++;
		return;
	}

	/*
	 * Use the 2 extra bytes requested above as headroom, so the data that
	 * follows the 14-byte Ethernet header starts on a 16-byte boundary.
	 */
	skb_reserve(skb, 2);
	memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);

	/* Write metadata, and then pass to the receive level */
	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);
	skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */
	priv->stats.rx_packets++;
	priv->stats.rx_bytes += pkt->datalen;
	netif_rx(skb); /* the return value is discarded */
}
- TTY ... teletypewriter
- in the kernel
- TTY core
- TTY line discipline
- TTY driver
- types
$ cat /proc/tty/drivers
/dev/tty /dev/tty 5 0 system:/dev/tty
/dev/console /dev/console 5 1 system:console
/dev/ptmx /dev/ptmx 5 2 system
/dev/vc/0 /dev/vc/0 4 0 system:vtmaster
usbserial /dev/ttyUSB 188 0-511 serial
acm /dev/ttyACM 166 0-255 serial
hvc /dev/hvc 229 0-7 system
serial /dev/ttyS 4 64-95 serial
pty_slave /dev/pts 136 0-1048575 pty:slave
pty_master /dev/ptm 128 0-1048575 pty:master
unknown /dev/tty 4 1-63 console
$ ls -la /sys/class/tty/ | head
total 0
drwxr-xr-x 2 root root 0 May 18 13:49 .
drwxr-xr-x 34 root root 0 May 18 13:49 ..
lrwxrwxrwx 1 root root 0 May 18 13:49 console -> ../../devices/virtual/tty/console
lrwxrwxrwx 1 root root 0 May 18 13:49 hvc0 -> ../../devices/virtual/tty/hvc0
lrwxrwxrwx 1 root root 0 May 18 13:49 hvc1 -> ../../devices/virtual/tty/hvc1
lrwxrwxrwx 1 root root 0 May 18 13:49 hvc2 -> ../../devices/virtual/tty/hvc2
lrwxrwxrwx 1 root root 0 May 18 13:49 hvc3 -> ../../devices/virtual/tty/hvc3
lrwxrwxrwx 1 root root 0 May 18 13:49 hvc4 -> ../../devices/virtual/tty/hvc4
lrwxrwxrwx 1 root root 0 May 18 13:49 hvc5 -> ../../devices/virtual/tty/hvc5