关于linux 4.12 livepatch 机制的研究
关于内核hotfix已经比较成熟,基本做法就是定义一个新的函数,然后通过kallsyms_lookup_name找到旧函数的地址,在stop machine之后把旧函数入口替换成一条jmp到新函数地址的指令。这种做法带来的开销比较大,尤其是在有网络负载的情况下。那么今天我们以4.12内核代码为例来看一下livepatch机制(livepatch在更早的内核版本即已引入),它主要依赖dynamic ftrace。
dynamic ftrace机制
首先,我们先来看一下dynamic ftrace机制。在内核编译时需要打开CONFIG_DYNAMIC_FTRACE,同时在编译内核时使用-pg选项。这样就会在每个内核函数的开始处增加一个call mcount调用。
/* Minimal example used below to show the effect of gcc's -pg option
 * on the generated assembly. */
void bar(void){
int a = 1;
}
不加pg选项的编译结果
bar:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $1, -4(%rbp)
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
加上pg选项后编译的结果
bar:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
call mcount
movl $1, -4(%rbp)
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
内核代码编译生成.o时,会调用位于scripts目录下的recordmcount.pl将每个call mcount调用点的地址记录到__mcount_loc段;内核初始化的时候,ftrace遍历__mcount_loc段得到这些调用点地址,将call mcount替换为nop,等打开ftrace时,才会把nop替换成call ftrace_caller。4.12内核的livepatch就是利用了这个机制,下面我们来具体分析一下。
livepatch 相关数据
在linux 内核的sample下面有一个livepatch的sample:livepatch-sample.c。
/* Functions to patch: replace cmdline_proc_show() with
 * livepatch_cmdline_proc_show(). Only old_name is given here; the
 * address is resolved (via kallsyms) when the patch is enabled.
 * The empty element terminates the array. */
static struct klp_func funcs[] = {
{
.old_name = "cmdline_proc_show",
.new_func = livepatch_cmdline_proc_show,
}, { }
};
/* Objects to patch; one entry per module (or vmlinux) that contains
 * functions listed above. */
static struct klp_object objs[] = {
{
/* name being NULL means vmlinux */
.funcs = funcs,
}, { }
};
/* The top-level patch descriptor handed to klp_register_patch(). */
static struct klp_patch patch = {
.mod = THIS_MODULE,
.objs = objs,
};
核心数据结构就是klp_patch这个数据结构,接着往下看
#先注册patch 这个数据结构
ret = klp_register_patch(&patch);
if (ret)
return ret;
#然后enable patch
ret = klp_enable_patch(&patch);
klp_register_patch主要为新加的patch初始化对应的kobject,同时将其加到klp_patches链表的尾部。重点看一下klp_enable_patch
/*
 * Enable a previously registered patch: take klp_mutex, verify the
 * patch is on the registered list, then hand off to
 * __klp_enable_patch(). Returns 0 on success, -EINVAL if the patch
 * was never registered, or the error from __klp_enable_patch().
 */
int klp_enable_patch(struct klp_patch *patch)
{
int ret;
mutex_lock(&klp_mutex);
if (!klp_is_patch_registered(patch)) {
ret = -EINVAL;
goto err;
}
/* Core step: actually enable the patch. */
ret = __klp_enable_patch(patch);
err:
mutex_unlock(&klp_mutex);
return ret;
}
那么可以看到__klp_enable_patch最终调用了这个函数
/*
 * Redirect one function to its patched replacement by hooking its
 * ftrace call site. Returns 0 on success, negative errno on failure.
 */
static int klp_patch_func(struct klp_func *func)
{
struct klp_ops *ops;
int ret;
if (WARN_ON(!func->old_addr))
return -EINVAL;
if (WARN_ON(func->patched))
return -EINVAL;
/* Check whether a klp_ops already exists for this function
 * (i.e. it was already hooked by a previous patch). */
ops = klp_find_ops(func->old_addr);
if (!ops) {
unsigned long ftrace_loc;
/* Resolve the ftrace call site (mcount/fentry location) of
 * the old function. */
ftrace_loc = klp_get_ftrace_location(func->old_addr);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
return -EINVAL;
}
/* First patch for this function: allocate its klp_ops. */
ops = kzalloc(sizeof(*ops), GFP_KERNEL);
if (!ops)
return -ENOMEM;
/* fops.func is the ftrace callback that replaces the mcount
 * call; for livepatch this is klp_ftrace_handler. */
ops->fops.func = klp_ftrace_handler;
ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
FTRACE_OPS_FL_DYNAMIC |
FTRACE_OPS_FL_IPMODIFY;
list_add(&ops->node, &klp_ops);
INIT_LIST_HEAD(&ops->func_stack);
list_add_rcu(&func->stack_node, &ops->func_stack);
/* Hook only the target function; without this filter every
 * traceable function would be redirected. */
ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
if (ret) {
pr_err("failed to set ftrace filter for function '%s' (%d)\n",
func->old_name, ret);
goto err;
}
/* Register the ftrace_ops so the redirection goes live. */
ret = register_ftrace_function(&ops->fops);
if (ret) {
pr_err("failed to register ftrace handler for function '%s' (%d)\n",
func->old_name, ret);
ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
goto err;
}
} else {
/* Function already hooked: push this patch's replacement onto
 * the top of the func_stack. */
list_add_rcu(&func->stack_node, &ops->func_stack);
}
func->patched = true;
return 0;
err:
list_del_rcu(&func->stack_node);
list_del(&ops->node);
kfree(ops);
return ret;
}
核心函数就是register_ftrace_function,接下来一步步跟踪,ftrace_startup->__register_ftrace_function.最终调用到这个函数
/*
 * Create or update the per-ops trampoline and patch its internal
 * call instruction to invoke the ops' callback function.
 */
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
ftrace_func_t func;
unsigned char *new;
unsigned long offset;
unsigned long ip;
unsigned int size;
int ret, npages;
if (ops->trampoline) {
/*
 * The ftrace_ops caller may set up its own trampoline.
 * In such a case, this code must not modify it.
 */
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
set_memory_rw(ops->trampoline, npages);
} else {
/* No trampoline yet: allocate and build one for this ops. */
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}
/* Locate the call instruction inside the trampoline copy. */
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
func = ftrace_ops_get_func(ops);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
set_memory_ro(ops->trampoline, npages);
/* The update should never fail */
WARN_ON(ret);
}
这个函数的主要作用是为ftrace_ops建立trampoline,下面看一下create_trampoline函数
/*
 * Build a per-ops trampoline: copy the generic ftrace caller body,
 * append a jmp back to ftrace_epilogue, store the ftrace_ops pointer
 * at the end, and rewrite the rip-relative load so the callback
 * receives this ops instead of the global function_trace_op.
 * Returns the trampoline address, or 0 on failure.
 */
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
{
unsigned const char *jmp;
unsigned long start_offset;
unsigned long end_offset;
unsigned long op_offset;
unsigned long offset;
unsigned long size;
unsigned long ip;
unsigned long *ptr;
void *trampoline;
/* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
union ftrace_op_code_union op_ptr;
int ret;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
/* These assembly routines live in the arch ftrace_64.S file. */
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
op_offset = (unsigned long)ftrace_caller_op_ptr;
}
/* Total length of the ftrace_regs_caller body to copy. */
size = end_offset - start_offset;
/*
 * Allocate enough size to store the ftrace_caller code,
 * the jmp to ftrace_epilogue, as well as the address of
 * the ftrace_ops this trampoline is used for.
 */
/* MCOUNT_INSN_SIZE is the size of a call insn (0xe8 + rel32),
 * i.e. 5 bytes. */
trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
if (!trampoline)
return 0;
*tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *);
/* Copy ftrace_caller onto the trampoline memory */
/* Copy ftrace_regs_caller into the trampoline buffer. */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
if (WARN_ON(ret < 0)) {
tramp_free(trampoline, *tramp_size);
return 0;
}
ip = (unsigned long)trampoline + size;
/* The trampoline ends with a jmp to ftrace_epilogue */
/**
 * How ftrace_jmp_replace works: it emits 'jmp $dst_addr':
 * static union ftrace_code_union calc;
 * calc.e8 = 0xe9; // opcode of the JMP instruction
 * calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
 * jump target offset = dst_addr - (src_addr + JMP insn length)
 */
jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue);
memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); // copy 'jmp ftrace_epilogue' to the end of the trampoline
/*
 * The address of the ftrace_ops that is used for this trampoline
 * is stored at the end of the trampoline. This will be used to
 * load the third parameter for the callback. Basically, that
 * location at the end of the trampoline takes the place of
 * the global function_trace_op variable.
 */
/* Store the ftrace_ops pointer so it is passed to the ftrace
 * callback; for livepatch the callback is klp_ftrace_handler. */
ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE);
*ptr = (unsigned long)ops;
op_offset -= start_offset;
memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
/* Are we pointing to the reference? */
if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
tramp_free(trampoline, *tramp_size);
return 0;
}
/* Load the contents of ptr into the callback parameter */
offset = (unsigned long)ptr;
offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE;
op_ptr.offset = offset;
/* put in the new offset to the ftrace_ops */
memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
return (unsigned long)trampoline;
}
trampoline创建结束之后,开始执行替换ftrace_call的操作
//offset = ftrace_regs_call - ftrace_regs_caller
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
//func即为 klp_ftrace_handler
func = ftrace_ops_get_func(ops);
/* Do a safe modify in case the trampoline is executing */
//和ftrace_jmp_replace逻辑一样,只是JMP指令换成了CALL
new = ftrace_call_replace(ip, (unsigned long)func);
//更新指令
ret = update_ftrace_func(ip, new);
至此,已经将要hotfix的函数入口处的ftrace_call修改成了klp_ftrace_handler,下面我们看一下这个函数主要做了什么
/*
 * The ftrace callback invoked at the entry of every patched function.
 * Picks the newest applicable replacement from the func_stack and
 * redirects execution to it by rewriting the saved instruction pointer.
 */
static void notrace klp_ftrace_handler(unsigned long ip,
unsigned long parent_ip,
struct ftrace_ops *fops,
struct pt_regs *regs)
{
struct klp_ops *ops;
struct klp_func *func;
int patch_state;
/* Recover the klp_ops from the embedded ftrace_ops. */
ops = container_of(fops, struct klp_ops, fops);
/*
 * A variant of synchronize_sched() is used to allow patching functions
 * where RCU is not watching, see klp_synchronize_transition().
 */
preempt_disable_notrace();
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
stack_node);
/*
 * func should never be NULL because preemption should be disabled here
 * and unregister_ftrace_function() does the equivalent of a
 * synchronize_sched() before the func_stack removal.
 */
if (WARN_ON_ONCE(!func))
goto unlock;
/*
 * In the enable path, enforce the order of the ops->func_stack and
 * func->transition reads. The corresponding write barrier is in
 * __klp_enable_patch().
 *
 * (Note that this barrier technically isn't needed in the disable
 * path. In the rare case where klp_update_patch_state() runs before
 * this handler, its TIF_PATCH_PENDING read and this func->transition
 * read need to be ordered. But klp_update_patch_state() already
 * enforces that.)
 */
smp_rmb();
if (unlikely(func->transition)) {
/*
 * Enforce the order of the func->transition and
 * current->patch_state reads. Otherwise we could read an
 * out-of-date task state and pick the wrong function. The
 * corresponding write barrier is in klp_init_transition().
 */
smp_rmb();
patch_state = current->patch_state;
WARN_ON_ONCE(patch_state == KLP_UNDEFINED);
if (patch_state == KLP_UNPATCHED) {
/*
 * Use the previously patched version of the function.
 * If no previous patches exist, continue with the
 * original function.
 */
func = list_entry_rcu(func->stack_node.next,
struct klp_func, stack_node);
if (&func->stack_node == &ops->func_stack)
goto unlock;
}
}
/* This is the actual hotfix: redirect the return ip so that the
 * replacement function runs instead of the original. */
klp_arch_set_pc(regs, (unsigned long)func->new_func);
unlock:
preempt_enable_notrace();
}
到这里已经把整个核心流程讲完了。个人觉得livepatch的好处就是把hotfix框架化,且开销更小,更强的地方是你可以控制这个patch是否要立即生效(这一点会在下篇博文中讲到)。当然,它也有一些限制:它依赖于dynamic ftrace机制,因此会跟其他同样依赖此机制的框架相冲突;另外,只有能被trace、且ftrace调用点位于函数入口处的函数才能被hotfix。要想知道哪些函数能使用这个框架进行hotfix也不难,打开ftrace目录看一下available_filter_functions文件,里面列出的函数都是能被trace的。