学习和研究中前行,并在分享中提升自己

欢迎订阅阿里内推邮件



关于linux 4.12 livepatch 机制的研究

阅读次数: 462| 时间:2017年12月24日 17:22 | 标签:linux

关于内核hotfix已经比较成熟,基本做法就是定义一个新的函数,然后通过kallsyms_lookup_name找到旧函数的地址,在旧函数入口处通过jmp跳转到新函数地址,然后stop machine后再替换。这样带来的开销比较大,尤其是在有网络负载的情况下。那么今天我们来看一下4.12内核中的livepatch机制,它主要是依赖dynamic ftrace。

dynamic ftrace机制

首先,我们先来看一下dynamic ftrace机制。在内核编译时需要打开CONFIG_DYNAMIC_FTRACE,同时在编译内核时使用-pg选项,这样就会在每个内核函数的开始处增加一条call mcount指令。

/* Minimal example function; its compiled output with and without -pg is compared below. */
void bar(void){
  int a = 1;
}

不加pg选项的编译结果

 # bar compiled without -pg: frame setup, the store of 1, frame teardown.
 bar:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    movl    $1, -4(%rbp)    # a = 1
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc   

加上pg选项后编译的结果

# bar compiled with -pg: a call to mcount appears before the function body.
bar:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $16, %rsp
    call    mcount          # profiling hook, present only in the -pg build
    movl    $1, -4(%rbp)    # a = 1
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc

内核代码编译生成.o时,会调用位于scripts目录下的recordmcount.pl,将每个函数的地址写入__mcount_loc段。在内核初始化的时候,ftrace查询__mcount_loc段得到每个函数的入口地址,将mcount替换为nop;等打开ftrace时,才会把nop替换成ftrace_caller。4.12内核的livepatch就是用了这个机制,下面我们来具体分析一下。

livepatch 相关数据

在linux内核的samples目录下面有一个livepatch的sample:livepatch-sample.c。

/*
 * Functions to patch: the function named "cmdline_proc_show" is to be
 * replaced by livepatch_cmdline_proc_show.
 */
static struct klp_func funcs[] = {
    {
        .old_name = "cmdline_proc_show",
        .new_func = livepatch_cmdline_proc_show,
    }, { }  /* trailing empty entry; presumably the array terminator */
};

/* Objects to patch; the single entry points at the funcs array. */
static struct klp_object objs[] = {
    {
        /* name being NULL means vmlinux */
        .funcs = funcs,
    }, { }  /* trailing empty entry; presumably the array terminator */
};

/* Patch descriptor: associates this module with the objs array. */
static struct klp_patch patch = {
    .mod = THIS_MODULE,
    .objs = objs,
};

核心数据结构就是klp_patch这个数据结构,接着往下看

/* Register the patch data structure first. */
ret = klp_register_patch(&patch);
if (ret)
    return ret;
/* Then enable the patch. */
ret = klp_enable_patch(&patch);

register_patch主要初始化新加的patch为新的object,同时将其加到klp_patches链表的后面。重点看一下klp_enable_patch

/*
 * Enable a previously registered patch.
 *
 * Holds klp_mutex for the whole operation. Returns -EINVAL if the patch is
 * not registered; otherwise returns the result of __klp_enable_patch().
 */
int klp_enable_patch(struct klp_patch *patch)
{
    int ret;

    mutex_lock(&klp_mutex);

    if (!klp_is_patch_registered(patch)) {
        ret = -EINVAL;
        goto err;
    }
    // Core function: does the actual work of enabling the patch
    ret = __klp_enable_patch(patch);

err:
    /* Shared exit path: reached on both success and failure. */
    mutex_unlock(&klp_mutex);
    return ret;
}

那么可以看到__klp_enable_patch最终调用了这个函数

/*
 * Hook one function for live patching.
 *
 * Looks up the klp_ops for func->old_addr. If none exists, one is allocated,
 * klp_ftrace_handler is installed as its ftrace callback, the ops is
 * restricted to the function's ftrace location, and it is registered with
 * ftrace. If one already exists, func is only added to its func_stack.
 *
 * Returns 0 on success; -EINVAL if old_addr is unset, func is already
 * patched, or no ftrace location is found; -ENOMEM if the allocation fails;
 * otherwise the error returned by the failing ftrace call.
 */
static int klp_patch_func(struct klp_func *func)
{
    struct klp_ops *ops;
    int ret;

    if (WARN_ON(!func->old_addr))
        return -EINVAL;

    if (WARN_ON(func->patched))
        return -EINVAL;
    // First check whether this function already has a klp_ops
    ops = klp_find_ops(func->old_addr);
    if (!ops) {
        unsigned long ftrace_loc;
        // Look up the ftrace location for the old function's address
        ftrace_loc = klp_get_ftrace_location(func->old_addr);
        if (!ftrace_loc) {
            pr_err("failed to find location for function '%s'\n",
                func->old_name);
            return -EINVAL;
        }
        // Allocate and initialize a klp_ops structure for this function
        ops = kzalloc(sizeof(*ops), GFP_KERNEL);
        if (!ops)
            return -ENOMEM;
        // Set the ftrace_ops callback; this is the function that takes the place of mcount
        ops->fops.func = klp_ftrace_handler;
        ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS |
                  FTRACE_OPS_FL_DYNAMIC |
                  FTRACE_OPS_FL_IPMODIFY;

        list_add(&ops->node, &klp_ops);

        INIT_LIST_HEAD(&ops->func_stack);
        list_add_rcu(&func->stack_node, &ops->func_stack);
         // Hook only the target function; without this step every traceable function would be hooked
        ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
        if (ret) {
            pr_err("failed to set ftrace filter for function '%s' (%d)\n",
                   func->old_name, ret);
            goto err;
        }
         // Register this ftrace_ops
        ret = register_ftrace_function(&ops->fops);
        if (ret) {
            pr_err("failed to register ftrace handler for function '%s' (%d)\n",
                   func->old_name, ret);
            /* Undo the filter set above (third argument 1 instead of 0). */
            ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
            goto err;
      }
} else {
        /* An ops already exists for this address: just add func to its stack. */
        list_add_rcu(&func->stack_node, &ops->func_stack);
    }

    func->patched = true;

    return 0;

err:
    /* Only reached after a new ops was allocated and linked: unlink and free it. */
    list_del_rcu(&func->stack_node);
    list_del(&ops->node);
    kfree(ops);
    return ret;
}

核心函数就是register_ftrace_function,接下来一步步跟踪,ftrace_startup->__register_ftrace_function.最终调用到这个函数

/*
 * Create (or reuse) the trampoline of an ftrace_ops and point the call
 * instruction inside it at the ops' callback.
 *
 * If ops already has a trampoline that was not allocated here (the
 * FTRACE_OPS_FL_ALLOC_TRAMP flag is clear), it is left untouched. If
 * create_trampoline() fails, the function returns without doing anything.
 */
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
    ftrace_func_t func;
    unsigned char *new;
    unsigned long offset;
    unsigned long ip;
    unsigned int size;
    int ret, npages;

    if (ops->trampoline) {
        /*
         * The ftrace_ops caller may set up its own trampoline.
         * In such a case, this code must not modify it.
         */
        if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
            return;
        npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
        /* Make the existing trampoline writable before modifying it. */
        set_memory_rw(ops->trampoline, npages);
    } else {
        // Create the trampoline
        ops->trampoline = create_trampoline(ops, &size);
        if (!ops->trampoline)
            return;
        ops->trampoline_size = size;
        npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
    }

    /* Address, inside the trampoline, of the call instruction to rewrite. */
    offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
    ip = ops->trampoline + offset;

    func = ftrace_ops_get_func(ops);

    /* Do a safe modify in case the trampoline is executing */
    new = ftrace_call_replace(ip, (unsigned long)func);
    ret = update_ftrace_func(ip, new);
    /* Write-protect the trampoline again once the update is done. */
    set_memory_ro(ops->trampoline, npages);

    /* The update should never fail */
    WARN_ON(ret);
}

这个函数的主要作用是为ftrace_ops建立trampoline,下面看一下create_trampoline函数

/*
 * Build a private trampoline for an ftrace_ops.
 *
 * Copies the caller code (ftrace_regs_caller or ftrace_caller, depending on
 * FTRACE_OPS_FL_SAVE_REGS) into newly allocated memory, appends a jmp to
 * ftrace_epilogue followed by the ops pointer, and rewrites the
 * %rip-relative load at op_offset so that it reads that stored pointer.
 *
 * On success stores the total trampoline size in *tramp_size, sets
 * FTRACE_OPS_FL_ALLOC_TRAMP in ops->flags and returns the trampoline
 * address. Returns 0 on failure.
 */
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
{
    unsigned const char *jmp;
    unsigned long start_offset;
    unsigned long end_offset;
    unsigned long op_offset;
    unsigned long offset;
    unsigned long size;
    unsigned long ip;
    unsigned long *ptr;
    void *trampoline;
    /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
    unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
    union ftrace_op_code_union op_ptr;
    int ret;

    if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
        // See ftrace_64.S for these symbols
        start_offset = (unsigned long)ftrace_regs_caller;
        end_offset = (unsigned long)ftrace_regs_caller_end;
        op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
    } else {
        start_offset = (unsigned long)ftrace_caller;
        end_offset = (unsigned long)ftrace_epilogue;
        op_offset = (unsigned long)ftrace_caller_op_ptr;
    }
    // Length of the whole caller code (ftrace_regs_caller in the SAVE_REGS case)
    size = end_offset - start_offset;

    /*
     * Allocate enough size to store the ftrace_caller code,
     * the jmp to ftrace_epilogue, as well as the address of
     * the ftrace_ops this trampoline is used for.
     */
     // Allocate the memory; MCOUNT_INSN_SIZE is the size of the call (0xe8) instruction, 5 bytes
    trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *));
    if (!trampoline)
        return 0;

    *tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *);

    /* Copy ftrace_caller onto the trampoline memory */
    // Copy the caller code (ftrace_regs_caller in the SAVE_REGS case) into the trampoline
    ret = probe_kernel_read(trampoline, (void *)start_offset, size);
    if (WARN_ON(ret < 0)) {
        tramp_free(trampoline, *tramp_size);
        return 0;
    }

    ip = (unsigned long)trampoline + size;

    /* The trampoline ends with a jmp to ftrace_epilogue */
    /**
    * Notes on ftrace_jmp_replace(): it essentially generates 'jmp $dst_addr'
    * static union ftrace_code_union calc;
    * calc.e8     = 0xe9;   // opcode of the JMP instruction
    * calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
    * jump displacement = dst_addr - (src_addr + length of the JMP instruction)
    */
    jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue);
    memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); // copy 'JMP ftrace_epilogue' to the end of the trampoline

    /*
     * The address of the ftrace_ops that is used for this trampoline
     * is stored at the end of the trampoline. This will be used to
     * load the third parameter for the callback. Basically, that
     * location at the end of the trampoline takes the place of
     * the global function_trace_op variable.
     */
    // Pass the ftrace_ops as a parameter to the ftrace callback; for livepatch the callback is klp_ftrace_handler

    ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE);
    *ptr = (unsigned long)ops;

    /* Turn op_offset into an offset relative to the start of the copied code. */
    op_offset -= start_offset;
    memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);

    /* Are we pointing to the reference? */
    if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
        tramp_free(trampoline, *tramp_size);
        return 0;
    }

    /* Load the contents of ptr into the callback parameter */
    offset = (unsigned long)ptr;
    /* Displacement is measured from the end of the load instruction. */
    offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE;

    op_ptr.offset = offset;

    /* put in the new offset to the ftrace_ops */
    memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);

    /* ALLOC_TRAMP flags lets us know we created it */
    ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;

    return (unsigned long)trampoline;
}

trampoline创建结束之后,开始执行替换ftrace_call的操作

    // offset = ftrace_regs_call - ftrace_regs_caller
    offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); 
    ip = ops->trampoline + offset;                                              
     // func is klp_ftrace_handler here
    func = ftrace_ops_get_func(ops);                                            

    /* Do a safe modify in case the trampoline is executing */ 
    // Same logic as ftrace_jmp_replace, except that the JMP instruction becomes a CALL
    new = ftrace_call_replace(ip, (unsigned long)func); 
    // Update the instruction
    ret = update_ftrace_func(ip, new);                  

至此,已经将要hotfix的函数入口处的ftrace_call修改成了klp_ftrace_handler,下面我们看一下这个函数主要做了什么

/*
 * ftrace callback installed for every live-patched function.
 *
 * Takes the first klp_func on the func_stack of the klp_ops that embeds
 * fops and redirects execution to its new_func via klp_arch_set_pc(). While
 * that func is in transition and the current task's patch_state is
 * KLP_UNPATCHED, the next entry on the stack is used instead; if there is
 * no such entry, regs is left unmodified.
 */
static void notrace klp_ftrace_handler(unsigned long ip,
                       unsigned long parent_ip,
                       struct ftrace_ops *fops,
                       struct pt_regs *regs)
{
    struct klp_ops *ops;
    struct klp_func *func;
    int patch_state;
    // Find the klp_ops from the ftrace_ops embedded in it
    ops = container_of(fops, struct klp_ops, fops);

    /*
     * A variant of synchronize_sched() is used to allow patching functions
     * where RCU is not watching, see klp_synchronize_transition().
     */
    preempt_disable_notrace();

    func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
                      stack_node);

    /*
     * func should never be NULL because preemption should be disabled here
     * and unregister_ftrace_function() does the equivalent of a
     * synchronize_sched() before the func_stack removal.
     */
    if (WARN_ON_ONCE(!func))
        goto unlock;

    /*
     * In the enable path, enforce the order of the ops->func_stack and
     * func->transition reads.  The corresponding write barrier is in
     * __klp_enable_patch().
     *
     * (Note that this barrier technically isn't needed in the disable
     * path.  In the rare case where klp_update_patch_state() runs before
     * this handler, its TIF_PATCH_PENDING read and this func->transition
     * read need to be ordered.  But klp_update_patch_state() already
     * enforces that.)
     */
    smp_rmb();
    if (unlikely(func->transition)) {

        /*
         * Enforce the order of the func->transition and
         * current->patch_state reads.  Otherwise we could read an
         * out-of-date task state and pick the wrong function.  The
         * corresponding write barrier is in klp_init_transition().
         */
        smp_rmb();

        patch_state = current->patch_state;

        WARN_ON_ONCE(patch_state == KLP_UNDEFINED);

        if (patch_state == KLP_UNPATCHED) {
            /*
             * Use the previously patched version of the function.
             * If no previous patches exist, continue with the
             * original function.
             */
            func = list_entry_rcu(func->stack_node.next,
                          struct klp_func, stack_node);

            /* Reached the list head: there is no earlier entry on the stack. */
            if (&func->stack_node == &ops->func_stack)
                goto unlock;
        }
    }
    // This performs the actual hotfix, i.e. redirects execution to the replacement function
    klp_arch_set_pc(regs, (unsigned long)func->new_func);
unlock:
    preempt_enable_notrace();
}

到这里已经把整个核心流程讲完了,个人觉得livepatch的好处就是把hotfix框架化,且开销更小,更牛B的地方是你可以设置这个patch是不是要立即生效(这一点,会在下篇博文中讲到)。当然,它也有一些限制:它需要依赖于dynamic ftrace机制,因此会跟其他同样依赖于此机制的框架相冲突;另外,只有能被trace且ftrace调用点位于函数入口处的函数才能被hotfix。要想知道哪些函数能使用这个框架进行hotfix也不难,打开ftrace目录(通常是/sys/kernel/debug/tracing)看一下available_filter_functions文件,里面列出的函数都是能被hotfix的。