分类 Linux 相关 下的文章

BPF 例子 - 观测某个 tcp 连接的状态变化

给 2 个参数, 分别是 IP 和端口, 观察符合这个条件的 tcp 连接的状态变化.

这个例子中本想同时对比 IP 和 port, 但是对于IP 遇到一个问题.

常见的 IPv4 是这么写的(字符串): 145.23.45.23
$sk->__sk_common.skc_daddr 拿到的地址 (unsigned int32): 0X91172d17.
使用 ntop 函数转换后是一个 inet_t, 虽然文档死硬说是 String 表示形式: inet_t = “145.23.45.23”

那么为了比较2个 IP 地址, 你想把 0X91172d17 转成 inet_t = “145.23.45.23”, 但是还不能比, 不能用 String 和 inet_t 比较.
要么把 145.23.45.23 通过 pton 转成 uint8[], 然后再通过强转 ((uint32)pton("145.23.45.23")) 再和 0X91172d17 比较.

关键的关键是 pton 的参数一定要是 字符串常量, 变量不行, 因为它要在编译时知道类型. TMD.

#!/usr/bin/env bpftrace

#ifndef BPFTRACE_HAVE_BTF
#include <linux/socket.h>
#include <net/sock.h>
#else
#include <sys/socket.h>
#endif

// Startup block: records the host/port of interest, prints the column header,
// and fills @states, the table indexed by the numeric state that the
// tcp_set_state probe receives.
BEGIN
{
    // Defaults used when no positional parameters are given.
    @host = "127.0.0.1";
    @port = (uint16)80;
    // A non-empty first positional parameter replaces the default host.
    // Note: @host is only printed below; the probe filters on @port alone.
    if ("" != str($1)) {
        @host = str($1);
    }
    // A non-empty second positional parameter replaces the default port.
    if ("" != str($2)) {
        @port = (uint16)$2;
    }

    printf("looking for tcp connection related to : %s:%d\n", @host, @port);
    printf("%39s:%-6s %39s:%-6s %-10s -> %-10s\n", "src IP", "src port", "dest ip", "dest port", "old state", "new state");
    // State number -> printable name.
    @states[1] = "ESTABLISHED";
    @states[2] = "SYN_SENT";
    @states[3] = "SYN_RECV";
    @states[4] = "FIN_WAIT1";
    @states[5] = "FIN_WAIT2";
    @states[6] = "TIME_WAIT";
    @states[7] = "CLOSE";
    @states[8] = "CLOSE_WAIT";
    @states[9] = "LAST_ACK";
    @states[10] = "LISTEN";
    @states[11] = "CLOSING";
    @states[12] = "NEW_SYN_RECV";
}

// Fires on every call to the kernel function tcp_set_state and prints the
// state transition of sockets whose local or remote port equals @port.
// The previously seen state is remembered in @keyMap, keyed by the remote
// address and remote port.
kfunc:vmlinux:tcp_set_state {
    $sk = ((struct sock *) args.sk);
    $inet_family = $sk->__sk_common.skc_family;
    // Anything that is not AF_INET is formatted from the IPv6 address fields.
    if ($inet_family == AF_INET) {
      $daddr = ntop($sk->__sk_common.skc_daddr);
      $saddr = ntop($sk->__sk_common.skc_rcv_saddr);
    } else {
      $daddr = ntop($sk->__sk_common.skc_v6_daddr.in6_u.u6_addr8);
      $saddr = ntop($sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr8);
    }
    $lport = $sk->__sk_common.skc_num;
    $dport = $sk->__sk_common.skc_dport;

    // skc_dport is byte-swapped before it is compared with @port and printed;
    // skc_num is used as is.
    $dport = bswap($dport);

    if (($dport == @port) || ( $lport == @port)) {
        $curState = @states[args.state];
        // Key on the numeric port. str() reads a string from the address it
        // is given, so str($dport) never produced the port number as text.
        $oldState = @keyMap[$daddr, $dport];
        if ($oldState == "") {
            $oldState = "NONE";
        }
        @keyMap[$daddr, $dport] = $curState;

        printf("%39s:%-6d %39s:%-6d %-10s -> %-10s\n", $saddr, $lport, $daddr, $dport, $oldState, $curState);
    }
}

// Exit block: clears every map the script created.
END {
    clear(@states);
    clear(@keyMap);
    clear(@host);
    clear(@port);
}

container 里面没有 shell

今天本地搭建了一个 etcd 的服务器, 然后接下来想用 etcdctl 去测试一些命令, 于是想去用 container 里面的这个命令, 然后执行 docker exec -it etcd sh, 竟然得到没有这个文件, 然后又试了 bash 也没有.

首先 google 一把, 发现确实有人遇到同样的问题:
https://stackoverflow.com/questions/39900369/how-could-i-get-files-from-a-docker-container-running-the-official-etcd-image-if

按照上面的答案, 我们一个个去试一下:

docker export 导出整个container 文件

$ docker export etcd > /tmp/etcd.tar
$ ls -lah /tmp/etcd.tar
-rw-rw-r-- 1 supra supra 193M Aug 27 23:51 etcd.tar
$ mkdir /tmp/etcd
$ tar xvf /tmp/etcd.tar -C /tmp/etcd
$ ls -lah /tmp/etcd/bin/
total 8.0K
drwxr-xr-x  2 supra supra 4.0K Apr  2 04:55 .
drwxrwxr-x 16 supra supra 4.0K Aug 27 23:53 ..

$ find /tmp/etcd/ -name etcdctl
/tmp/etcd/usr/local/bin/etcdctl

可以看到, /bin 目录啥都没有. 但是我们找到了 etcdctl 在哪里.

docker cp 复制出 /bin 目录

$ mkdir /tmp/tmpbin
$ docker cp etcd:/bin /tmp/tmpbin/
$ ls -lah /tmp/tmpbin/bin/

可以看到内部啥也没有.

Kretprobes

使用 Kretprobes 监控内核函数的运行时间和返回值.

一个简单的例子

文件名: myretprobe.c

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
#include <linux/limits.h>
#include <linux/sched.h>

/*
 * Name of the kernel function to probe. Defaults to "ksys_read"; exposed as
 * the "func" module parameter, backed by this NAME_MAX-sized buffer.
 */
static char func_name[NAME_MAX] = "ksys_read";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the"
            " function's execution time");

/*
 * Per-instance private data. The kretprobe reserves sizeof(struct my_data)
 * bytes for it (see .data_size) and both handlers reach it through ri->data.
 */
struct my_data {
    ktime_t entry_stamp;    /* ktime_get() value stored by entry_handler */
};

/* Here we use the entry_hanlder to timestamp function entry */
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
    struct my_data *data;

    if (!current->mm)
        return 1;    /* Skip kernel threads */

    data = (struct my_data *)ri->data;
    data->entry_stamp = ktime_get();
    return 0;
}

/*
 * Return-probe handler: log the return value and duration. Duration may turn
 * out to be zero consistently, depending upon the granularity of time
 * accounting on the platform.
 *
 * The duration is the time between the timestamp stored by entry_handler in
 * the per-instance data and now. Always returns 0.
 */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
    /*
     * Keep the return value register-wide: squeezing it into an int
     * truncates pointers and large sizes returned by the probed function.
     */
    long retval = (long)regs_return_value(regs);
    struct my_data *data = (struct my_data *)ri->data;
    s64 delta;
    ktime_t now;

    now = ktime_get();
    delta = ktime_to_ns(ktime_sub(now, data->entry_stamp));
    printk(KERN_INFO "%s returned %ld and took %lld ns to execute\n",
            func_name, retval, (long long)delta);
    return 0;
}

/*
 * The return probe itself: entry_handler runs when the probed function is
 * entered, ret_handler when it returns. kp.symbol_name is filled in by
 * kretprobe_init.
 */
static struct kretprobe my_kretprobe = {
    .handler        = ret_handler,
    .entry_handler        = entry_handler,
    /* Room for one struct my_data per instance. */
    .data_size        = sizeof(struct my_data),
    /* Probe up to 20 instances concurrently. */
    .maxactive        = 20,
};

/*
 * Module init: point the kretprobe at func_name and register it.
 * Returns 0 on success or the negative error code from register_kretprobe().
 */
static int __init kretprobe_init(void)
{
    int ret;

    my_kretprobe.kp.symbol_name = func_name;
    ret = register_kretprobe(&my_kretprobe);
    if (ret < 0) {
        printk(KERN_ERR "register_kretprobe failed, returned %d\n",
                ret);
        /* Hand back the real error instead of a hard-coded -1 (-EPERM). */
        return ret;
    }
    printk(KERN_INFO "Planted return probe at %s: %p\n",
            my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
    return 0;
}

/*
 * Module exit: unregister the return probe, then log where it was planted
 * and how many instances were missed.
 */
static void __exit kretprobe_exit(void)
{
    struct kretprobe *rp = &my_kretprobe;

    unregister_kretprobe(rp);
    pr_info("kretprobe at %p unregistered\n", rp->kp.addr);

    /* A non-zero nmissed suggests that maxactive was set too low. */
    pr_info("Missed probing %d instances of %s\n",
        rp->nmissed, rp->kp.symbol_name);
}

/* Register the module entry/exit points and declare the license. */
module_init(kretprobe_init)
module_exit(kretprobe_exit)
MODULE_LICENSE("GPL");

Makefile

obj-m += myretprobe.o

# Kernel release to build against; override with `make tag=<release>`.
tag ?= $(shell uname -r)
KDIR := /lib/modules/$(tag)/build/

.PHONY: all clean

# Recipe lines must start with a TAB. $(MAKE) keeps flags and the jobserver
# intact when recursing into the kernel build tree.
all:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

运行并查看结果

$ make all
$ sudo insmod myretprobe.ko

$ sudo rmmod myretprobe
$ tail -n 10 /var/log/syslog
Jul 13 00:27:54 supra kernel: [ 2367.855060] ksys_read returned -32 and took 234 ns to execute
Jul 13 00:27:54 supra kernel: [ 2367.855063] ksys_read returned -32 and took 190 ns to execute
Jul 13 00:27:54 supra kernel: [ 2367.855066] ksys_read returned -32 and took 191 ns to execute
Jul 13 00:27:54 supra kernel: [ 2367.855068] ksys_read returned -32 and took 189 ns to execute

统计返回值 histogram 的例子

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
#include <linux/limits.h>
#include <linux/sched.h>
#include <linux/init.h>

#define MY_ARRAY_SIZE 10
/* Histogram counters: one slot per return-value bucket, filled by ret_handler. */
static uint my_array[MY_ARRAY_SIZE];

/*
 * Name of the kernel function to probe. Defaults to "ksys_read"; exposed as
 * the "func" module parameter.
 */
static char func_name[NAME_MAX] = "ksys_read";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report a"
            " histogram of the function's return values");

/*
 * Return-probe handler: sort the probed function's return value into one of
 * the MY_ARRAY_SIZE buckets and increment that bucket in my_array.
 * Buckets: <0, ==0, <20, <40, <80, <160, <320, <640, <1280, everything else.
 * Always returns 0.
 */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
    /* Exclusive upper bound of buckets 0..8; larger values fall into bucket 9. */
    static const int bucket_limit[] = {
        0, 1, 20, 40, 80, 160, 320, 640, 1280
    };
    int retval = regs_return_value(regs);
    int bucket = 0;

    /* Walk up until the first bound the value stays below. */
    while (bucket < (int)ARRAY_SIZE(bucket_limit) &&
           retval >= bucket_limit[bucket])
        bucket++;

    my_array[bucket]++;
    return 0;
}

/*
 * The return probe: only a return handler is installed, with maxactive set
 * to 3. kp.symbol_name is filled in by kretprobe_init.
 */
static struct kretprobe my_kretprobe = {
    .handler        = ret_handler,
    .maxactive        = 3,
};

/*
 * Module init: clear the histogram, point the kretprobe at func_name and
 * register it.
 * Returns 0 on success or the negative error code from register_kretprobe().
 */
static int __init kretprobe_init(void)
{
    int ret;

    memset(my_array, 0, sizeof(my_array));
    my_kretprobe.kp.symbol_name = func_name;
    ret = register_kretprobe(&my_kretprobe);
    if (ret < 0) {
        printk(KERN_ERR "register_kretprobe failed, returned %d\n",
                ret);
        /* Hand back the real error instead of a hard-coded -1 (-EPERM). */
        return ret;
    }
    printk(KERN_INFO "Planted return probe at %s: %p\n",
            my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);
    return 0;
}

/*
 * Module exit: unregister the return probe, dump every histogram bucket to
 * the kernel log, and report how many instances were missed.
 */
static void __exit kretprobe_exit(void)
{
    struct kretprobe *rp = &my_kretprobe;
    int bucket = 0;

    unregister_kretprobe(rp);
    pr_info("kretprobe at %p unregistered\n", rp->kp.addr);

    pr_info("my_array values:\n");
    while (bucket < MY_ARRAY_SIZE) {
        pr_info("my_array[%d]: %u\n", bucket, my_array[bucket]);
        bucket++;
    }

    /* A non-zero nmissed suggests that maxactive was set too low. */
    pr_info("Missed probing %d instances of %s\n",
        rp->nmissed, rp->kp.symbol_name);
}

/* Register the module entry/exit points and declare the license. */
module_init(kretprobe_init)
module_exit(kretprobe_exit)
MODULE_LICENSE("GPL");

使用Kretprobes 观测系统调用 read 字节数 并放到 proc 文件系统

下面使用 Kretprobes 观测系统调用 ksys_read() 的返回字节数, 并把这些数字做成 histogram 的形式放到 /proc/readpattern 去, 然后读这个文件.

源代码

文件名 dumpreadstat.c:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/ktime.h>
#include <linux/limits.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/proc_fs.h>

/* Number of histogram buckets. */
#define MY_ARRAY_SIZE 10
/* Storage reserved for each bucket label, terminator included. */
#define MAX_STRING_LENGTH 16
/* Histogram counters, incremented by ret_handler. */
static uint my_array[MY_ARRAY_SIZE];
/* Label printed next to my_array[i] when /proc/readpattern is read. */
static char desc_arr[MY_ARRAY_SIZE][MAX_STRING_LENGTH] = {
    "< 0        ",
    "= 0        ",
    "0 -> 20    ",
    "20 -> 40   ",
    "40 -> 80   ",
    "80 -> 160  ",
    "160 -> 320 ",
    "320 -> 640 ",
    "640 -> 1280",
    " > 1280    "
};

/* Entry returned by proc_create(); removed again in kretprobe_exit. */
static struct proc_dir_entry *proc_file;

/*
 * Name of the kernel function to probe. Defaults to "ksys_read"; exposed as
 * the "func" module parameter.
 */
static char func_name[NAME_MAX] = "ksys_read";
module_param_string(func, func_name, NAME_MAX, S_IRUGO);
MODULE_PARM_DESC(func, "Function to kretprobe; this module will report a"
            " histogram of the function's return values");

/* open() on the proc file: only logs the call. */
static int open_proc(struct inode *inode, struct file *file)
{
    printk(KERN_ALERT "open proc\n");
    return 0;
}

/* Final close of the proc file: only logs the call. */
static int release_proc(struct inode *inode, struct file *file)
{
    printk(KERN_ALERT "release proc\n");
    return 0;
}

/*
 * read() on the proc file: format the histogram as text and copy the part
 * selected by *offset / length to user space.
 *
 * Returns the number of bytes copied, 0 once *offset is at or past the end
 * of the text, -EINVAL if the text does not fit the local buffer or *offset
 * is negative, and -EFAULT if the copy to user space fails.
 *
 * The text is rebuilt from the live counters on every call, so a reader that
 * fetches it in several small reads may see counters from different moments.
 */
static ssize_t read_proc(struct file *filp, char __user *buffer, size_t length, loff_t *offset)
{
    char output[1024];
    size_t used;
    loff_t pos = *offset;
    int ret;
    int i;

    printk(KERN_ALERT "read proc\n");

    /* The header is a short constant, so it always fits. */
    used = snprintf(output, sizeof(output), "bytes \t\t:count\n");

    for (i = 0; i < MY_ARRAY_SIZE; i++) {
        ret = snprintf(output + used, sizeof(output) - used,
                       "%s\t: %u\n", desc_arr[i], my_array[i]);
        if (ret < 0 || (size_t)ret >= sizeof(output) - used) {
            printk(KERN_ERR "Failed to concatenate my_array values\n");
            return -EINVAL;
        }
        used += ret;
    }

    if (pos < 0)
        return -EINVAL;
    /* End of file is decided by the file position, not by shared state. */
    if (pos >= (loff_t)used)
        return 0;

    printk(KERN_ALERT "%s", output);

    /* Never copy more than the caller's buffer can hold. */
    if (length > used - (size_t)pos)
        length = used - (size_t)pos;

    if (copy_to_user(buffer, output + pos, length))
    {
        printk(KERN_ERR "Data Send: Err!\n");
        return -EFAULT;
    }

    *offset = pos + length;
    return length;
}

static ssize_t write_proc(struct file *filp, const char *buffer, size_t len, loff_t *off)
{
    printk(KERN_ALERT "write proc\n");
    return 0;
}

/* File operations of the proc file; passed to proc_create() in kretprobe_init. */
static const struct proc_ops proc_fops = {
    .proc_open = open_proc,
    .proc_read = read_proc,
    .proc_write = write_proc,
    .proc_release = release_proc,
};

/*
 * Return-probe handler: sort the probed function's return value into one of
 * the MY_ARRAY_SIZE buckets and increment that bucket in my_array.
 * Buckets: <0, ==0, <20, <40, <80, <160, <320, <640, <1280, everything else.
 * Always returns 0.
 */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
    /* Exclusive upper bound of buckets 0..8; larger values fall into bucket 9. */
    static const int bucket_limit[] = {
        0, 1, 20, 40, 80, 160, 320, 640, 1280
    };
    int retval = regs_return_value(regs);
    int bucket = 0;

    /* Walk up until the first bound the value stays below. */
    while (bucket < (int)ARRAY_SIZE(bucket_limit) &&
           retval >= bucket_limit[bucket])
        bucket++;

    my_array[bucket]++;
    return 0;
}

/*
 * The return probe: only a return handler is installed, with maxactive set
 * to 3. kp.symbol_name is filled in by kretprobe_init.
 */
static struct kretprobe my_kretprobe = {
    .handler        = ret_handler,
    .maxactive        = 3,
};

/*
 * Module init: clear the histogram, register the kretprobe on func_name and
 * create /proc/readpattern.
 * Returns 0 on success, the negative error code from register_kretprobe(),
 * or -ENOMEM when the proc file cannot be created.
 */
static int __init kretprobe_init(void)
{
    int ret;

    memset(my_array, 0, sizeof(my_array));
    my_kretprobe.kp.symbol_name = func_name;
    ret = register_kretprobe(&my_kretprobe);
    if (ret < 0) {
        printk(KERN_ERR "register_kretprobe failed, returned %d\n",
                ret);
        /* Hand back the real error instead of a hard-coded -1 (-EPERM). */
        return ret;
    }
    printk(KERN_INFO "Planted return probe at %s: %p\n",
            my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr);

    /* Create the proc file /proc/readpattern */
    proc_file = proc_create("readpattern", 0666, NULL, &proc_fops);
    if (!proc_file) {
        printk(KERN_ERR "Failed to create proc file\n");
        /*
         * Init is about to fail, so the exit function will never run:
         * the probe has to be removed here or it would outlive the module.
         */
        unregister_kretprobe(&my_kretprobe);
        return -ENOMEM;
    }

    return 0;
}

/*
 * Module exit: unregister the return probe, dump every histogram bucket to
 * the kernel log, remove the proc file if it was created, and report how
 * many instances were missed.
 */
static void __exit kretprobe_exit(void)
{
    struct kretprobe *rp = &my_kretprobe;
    int bucket = 0;

    unregister_kretprobe(rp);
    pr_info("kretprobe at %p unregistered\n", rp->kp.addr);

    pr_info("my_array values:\n");
    while (bucket < MY_ARRAY_SIZE) {
        pr_info("my_array[%d]: %u\n", bucket, my_array[bucket]);
        bucket++;
    }

    if (proc_file != NULL) {
        proc_remove(proc_file);
        pr_info("Removed /proc/%s file\n", "readpattern");
    }

    /* A non-zero nmissed suggests that maxactive was set too low. */
    pr_info("Missed probing %d instances of %s\n",
        rp->nmissed, rp->kp.symbol_name);
}

/* Register the module entry/exit points and declare the license. */
module_init(kretprobe_init)
module_exit(kretprobe_exit)
MODULE_LICENSE("GPL");

Makefile

obj-m += dumpreadstat.o

# Kernel release to build against; override with `make tag=<release>`.
tag ?= $(shell uname -r)
KDIR := /lib/modules/$(tag)/build/

.PHONY: all clean

# Recipe lines must start with a TAB. $(MAKE) keeps flags and the jobserver
# intact when recursing into the kernel build tree.
all:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

执行并观测

$ make all
$ sudo insmod dumpreadstat.ko
$ cat /proc/readpattern
bytes         :count
< 0            : 1
= 0            : 27
0 -> 20        : 65
20 -> 40       : 24
40 -> 80       : 7
80 -> 160      : 8
160 -> 320     : 3
320 -> 640     : 2
640 -> 1280    : 11
 > 1280        : 10

# 卸载模块
$ sudo rmmod dumpreadstat

# 观测系统日志
$ tail -n 100 /var/log/syslog

Kprobes

Kprobes 允许开发者在内核函数的开始,结束,及任意偏移位置插入代码, 监视内核函数的执行, 并收集参数, 返回值,及运行时间等数据.

概念

  1. 有2种: kprobes, kretprobes
  2. 通常使用内核模块来注册 Kprobes, 在模块的 init 代码注册action handler, exit 代码注销;
  3. register_kprobe 注册在哪个内核函数位置注入, 以及要注入的代码块;
  4. unregister_kprobe 用来注销;
  5. 可以批量注册/注销 Kprobes;
  6. 有些特定的内核函数属于 blacklist, 是不允许插入代码的;

    1. 可以 probe 的函数列表 /sys/kernel/tracing/available_filter_functions;
    2. 不可以 probe 的: inline functions & /sys/kernel/debug/kprobes/blacklist

简单例子

一个简单的kernel 模块来注册/注销 Kprobes. 传入不同的参数, 可以注入到不同的kernel 代码位置.
文件名 kprobe.c (与下面 Makefile 中的 obj-m += kprobe.o 对应)

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

/* Size of the buffer that holds the probed symbol's name. */
#define MAX_SYMBOL_LEN    64
/*
 * Symbol to probe. Defaults to "vfs_write"; exposed as the "symbol" module
 * parameter with mode 0644.
 */
static char symbol[MAX_SYMBOL_LEN] = "vfs_write";
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
    /* Points at the buffer above, so the probe follows the module parameter. */
    .symbol_name    = symbol,
};

/*
 * Per-architecture accessors for the saved registers: argN() is the register
 * read as the N-th function argument, reg_flags() the status/flags register.
 * Keeping these here lets the handlers below build on every architecture the
 * macros cover instead of naming x86-only pt_regs fields directly.
 */
#if defined(CONFIG_X86_64)
#define arg0(pt_regs)    ((pt_regs)->di)
#define arg1(pt_regs)    ((pt_regs)->si)
#define arg2(pt_regs)    ((pt_regs)->dx)
#define arg3(pt_regs)    ((pt_regs)->cx)
#define arg4(pt_regs)    ((pt_regs)->r8)
#define arg5(pt_regs)    ((pt_regs)->r9)
#define reg_flags(pt_regs)    ((unsigned long)(pt_regs)->flags)
#elif defined(CONFIG_ARM64)
#define arg0(pt_regs)    ((pt_regs)->regs[0])
#define arg1(pt_regs)    ((pt_regs)->regs[1])
#define arg2(pt_regs)    ((pt_regs)->regs[2])
#define arg3(pt_regs)    ((pt_regs)->regs[3])
#define arg4(pt_regs)    ((pt_regs)->regs[4])
#define arg5(pt_regs)    ((pt_regs)->regs[5])
#define arg6(pt_regs)    ((pt_regs)->regs[6])
#define arg7(pt_regs)    ((pt_regs)->regs[7])
#define reg_flags(pt_regs)    ((unsigned long)(pt_regs)->pstate)
#else
#error "Unsupported architecture"
#endif


/*
 * kprobe pre_handler: called just before the probed instruction is executed.
 * Logs the probe address, the instruction pointer, the flags register and the
 * third argument register, which is printed as "count". Always returns 0.
 */
static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs)
{
    /* Casts keep the format specifiers valid whatever the register type is. */
    pr_info("<%s> p->addr = 0x%p, ip = %lx, flags = 0x%lx, count = %lu \n",
        p->symbol_name, p->addr, instruction_pointer(regs),
        reg_flags(regs), (unsigned long)arg2(regs));

    /* A dump_stack() here will give a stack backtrace */
    return 0;
}

/*
 * kprobe post_handler: called after the probed instruction is executed.
 * Logs the probe address and the flags register.
 */
static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs,
                unsigned long flags)
{
    pr_info("<%s> p->addr = 0x%p, flags = 0x%lx\n",
        p->symbol_name, p->addr, reg_flags(regs));
}

/*
 * Module init: attach both handlers to the kprobe and register it.
 * Returns 0 on success or the negative error code from register_kprobe().
 */
static int __init kprobe_init(void)
{
    int err;

    kp.pre_handler = handler_pre;
    kp.post_handler = handler_post;

    err = register_kprobe(&kp);
    if (err >= 0) {
        pr_info("Planted kprobe at %p\n", kp.addr);
        return 0;
    }

    pr_err("register_kprobe failed, returned %d\n", err);
    return err;
}

/* Module exit: unregister the kprobe and log the address it was planted at. */
static void __exit kprobe_exit(void)
{
    unregister_kprobe(&kp);
    pr_info("kprobe at %p unregistered\n", kp.addr);
}

/* Register the module entry/exit points and declare the license. */
module_init(kprobe_init)
module_exit(kprobe_exit)
MODULE_LICENSE("GPL");

同目录的 Makefile 文件:

obj-m += kprobe.o

# Kernel release to build against; override with `make tag=<release>`.
tag ?= $(shell uname -r)
KDIR := /lib/modules/$(tag)/build/

.PHONY: all clean

# Recipe lines must start with a TAB. $(MAKE) keeps flags and the jobserver
# intact when recursing into the kernel build tree.
all:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

编译并加载模块

$ make all
$ sudo insmod kprobe.ko

查看结果

```
$ tail -f -n 10 /var/log/syslog
Jul 12 11:46:18 supra kernel: [64756.049707] <vfs_write> p->addr = 0x000000007d821bae, flags = 0x293
Jul 12 11:46:18 supra kernel: [64756.049736] <vfs_write> p->addr = 0x000000007d821bae, ip = ffffffffafd7b621, flags = 0x293, count = 478
```

### 卸载模块

$ sudo rmmod kprobe


# kprobe 结构体定义
参见: https://github.com/torvalds/linux/blob/master/include/linux/kprobes.h#L60

/* Excerpt of the kernel's struct kprobe (see the kprobes.h link above). */
struct kprobe {

struct hlist_node hlist;
struct list_head list;
/* NOTE(review): presumably a count of missed probe hits; confirm in kprobes.h */
unsigned long nmissed;
/* Address of the probe point; one way of choosing where to insert the probe. */
kprobe_opcode_t *addr;
/* Name of the probed symbol; the other way of choosing the probe point. */
const char *symbol_name;
/* Offset added to the location given by symbol_name or addr. */
unsigned int offset;
/* Called just before the probed instruction is executed. */
kprobe_pre_handler_t pre_handler;
/* Called after the probed instruction is executed. */
kprobe_post_handler_t post_handler;
/* NOTE(review): presumably the original opcode replaced by the trap; confirm */
kprobe_opcode_t opcode;
struct arch_specific_insn ainsn;
u32 flags;

};


# 定义插入位置
1. 通过 `symbol_name`;
2. 通过 `addr`:
  要么通过 `symbol_name` 要么通过 `addr`. 上面的例子中使用 `symbol_name`, 如果要替换成 `addr`, 方法如下:
  1. 通过查找 `/proc/kallsyms` 定位地址:
     ```
     $ cat /proc/kallsyms | grep vfs_write
     ffffffffafd7b620 T vfs_write
     ```
  1. 替换结构体 .addr = (kprobe_opcode_t *)0xffffffffafd7b620 (此时不要再设置 .symbol_name, 两者只能指定其一)
  1. 通过 (symbol_name | addr) + offset

    .addr = ffffffffafd7b620,
    .offset = 5,

how it works

如下图, 把指定位置处的指令替换成 trap(0x03), 然后引导到新构建的代码块, 里面包含 pre_handler, 原指令,post_handler.
trap.png

其它