歡迎來到Linux教程網
Linux教程網
Linux教程網
Linux教程網
您现在的位置: Linux教程網 >> UnixLinux >  >> Linux綜合 >> Linux內核

Linux內核調試技術之修改內核定時器來定位系統僵死問題

1.簡介

  在內核調試中,經常會出現內核僵死的問題,也就是發生死循環,內核無法進行調度,導致系統失去響應。這種情況下,我們可以修改內核中系統時鐘的中斷處理函數,來定位發生僵死的進程和函數名稱。因為內核的系統時鐘是以硬件中斷的形式存在的,所以即使軟件發生僵死,系統時鐘中斷照樣會發生。

  1.1、我們在命令行輸入:# cat /proc/interrupts 
# cat /proc/interrupts 
           CPU0
 30:       8316         s3c  S3C2410 Timer Tick -----> 系統時鐘
 33:          0         s3c  s3c-mci
 34:          0         s3c  I2SSDI
 35:          0         s3c  I2SSDO
 37:         12         s3c  s3c-mci
 42:          0         s3c  ohci_hcd:usb1
 43:          0         s3c  s3c2440-i2c
 51:       1047     s3c-ext  eth0
 60:          0     s3c-ext  s3c-mci
 70:         16   s3c-uart0  s3c2440-uart
 71:         26   s3c-uart0  s3c2440-uart
 79:          8     s3c-adc  s3c2410_action
 80:       1732     s3c-adc  s3c2410_action
 83:          0           -  s3c2410-wdt
Err:          0
# 
   30:       8316         s3c  S3C2410 Timer Tick 這個就是系統時鐘,中斷號為30
 1.2、在內核代碼中搜索"S3C2410 Timer Tick"字樣。
  在Time.c (arch\arm\plat-s3c24xx)文件中有如下代碼。
/*
 * irqaction describing the system timer tick. The name is the same
 * "S3C2410 Timer Tick" label that appears in /proc/interrupts.
 */
static struct irqaction s3c2410_timer_irq = {
    .handler = s3c2410_timer_interrupt,
    .name    = "S3C2410 Timer Tick",
    .flags   = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
};

/*
 * IRQ handler for the timer.
 *
 * Besides running the normal tick under xtime_lock, the "#if 1" section
 * counts consecutive ticks during which the same pid is current and
 * prints that task's pid and name once the count reaches 10*HZ.
 */
static irqreturn_t
s3c2410_timer_interrupt(int irq, void *dev_id)
{
#if 1
    /* Both are static so that they persist from one tick to the next. */
    static pid_t last_pid;
    static int same_pid_ticks = 0;

    /* The timer interrupt is IRQ number 30. */
    if (irq == 30) {
        if (last_pid != current->pid) {
            /* A different task is running: restart the count for it. */
            same_pid_ticks = 0;
            last_pid = current->pid;
        } else {
            same_pid_ticks++;
        }

        /* The same task has been current for ten seconds' worth of ticks. */
        if (same_pid_ticks == 10 * HZ) {
            same_pid_ticks = 0;
            printk("s3c2410_timer_interrupt : pid = %d, task_name = %s\n",current->pid,current->comm);
        }
    }
#endif

    write_seqlock(&xtime_lock);
    timer_tick();
    write_sequnlock(&xtime_lock);

    return IRQ_HANDLED;
}

  ①、每個進程都有一個結構task_struct用來存儲進程的一些狀態信息。current是一個宏,表示當前進程的信息,也就是一個task_struct結構體,所以current->pid為當前進程的pid號,current->comm表示當前進程的name。

  ②、HZ也是一個宏定義,表示1s內發生多少次時鐘中斷。10*HZ表示的就是10s內發生多少次時鐘中斷!

 2、測試

  編譯內核:#make uImage

  加載一個帶有while(1);的驅動程序,系統發生僵死,系統會打印如下信息:

# insmod first_drv.ko 
# ./firstdrvtest on
s3c2410_timer_interrupt : pid = 770, task_name = firstdrvtest
s3c2410_timer_interrupt : pid = 770, task_name = firstdrvtest

 根據上述信息可知,發生僵死的進程號為:770,發生僵死的進程名稱為:firstdrvtest

3、繼續完善,增加PC值,更加詳細的定位僵死的地方

     我們知道,當中斷發生的時候,匯編代碼中會調用asm_do_IRQ函數:

    .macro    irq_handler
    get_irqnr_preamble r5, lr
1:    get_irqnr_and_base r0, r6, r5, lr
    movne    r1, sp
    @
    @ routine called with r0 = irq number, r1 = struct pt_regs *
    @
    adrne    lr, 1b
    bne    asm_do_IRQ #調用C語言的函數
asm_do_IRQ 函數原型:
 
/*
 * C entry point reached from the assembly irq_handler macro.
 *
 * irq:  interrupt number (passed in r0 by the assembly stub)
 * regs: register state saved at the moment of the interrupt
 *       (passed in r1 by the assembly stub)
 *
 * This is the unmodified function, so it carries no debugging state.
 */
asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
{
    /* Install regs for the duration of the IRQ; the previous value is restored below. */
    struct pt_regs *old_regs = set_irq_regs(regs);
    struct irq_desc *desc = irq_desc + irq;

    /*
     * Some hardware gives randomly wrong interrupts.  Rather
     * than crashing, do something sensible.
     */
    if (irq >= NR_IRQS)
        desc = &bad_irq_desc;

    irq_enter();

    desc_handle_irq(irq, desc);

    /* AT91 specific workaround */
    irq_finish(irq);

    irq_exit();
    set_irq_regs(old_regs);
}
  在asm_do_IRQ這個函數裡面,我們發現了一個結構體:struct pt_regs,這個結構體用來保存發生中斷時的現場,其中PC值就是:ARM_pc。我們將上面在s3c2410_timer_interrupt裡面加入的代碼都刪除,並把asm_do_IRQ函數修改為如下形式:(#if 1 與 #endif 之間為添加的程序)
/*
 * C entry point reached from the assembly irq_handler macro, instrumented
 * to locate a hung task.
 *
 * irq:  interrupt number
 * regs: register state saved at the moment of the interrupt; regs->ARM_pc
 *       is the saved program counter
 *
 * On every timer interrupt the "#if 1" sections count consecutive ticks
 * during which the same pid is current. When the count reaches 10*HZ the
 * task's pid, name and the interrupted PC are printed.
 */
asmlinkage void __exception asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
{
#if 1
    /* Declared once only; static so they persist from one interrupt to the next. */
    static pid_t pre_pid;
    static int cnt = 0;
#endif
    /* Install regs for the duration of the IRQ; the previous value is restored below. */
    struct pt_regs *old_regs = set_irq_regs(regs);
    struct irq_desc *desc = irq_desc + irq;

#if 1
    /* The timer interrupt is IRQ number 30. */
    if (irq == 30) {
        if (pre_pid == current->pid) {
            cnt++;
        } else {
            /* A different task is running: restart the count for it. */
            cnt = 0;
            pre_pid = current->pid;
        }

        /* The same task has been current for ten seconds' worth of ticks. */
        if (cnt == 10 * HZ) {
            cnt = 0;
            printk("s3c2410_timer_interrupt : pid = %d, task_name = %s\n",current->pid,current->comm);
            /* ARM_pc is a long-sized register slot, so print it with %lx. */
            printk("pc = %08lx\n", regs->ARM_pc);
        }
    }
#endif

    /*
     * Some hardware gives randomly wrong interrupts.  Rather
     * than crashing, do something sensible.
     */
    if (irq >= NR_IRQS)
        desc = &bad_irq_desc;

    irq_enter();

    desc_handle_irq(irq, desc);

    /* AT91 specific workaround */
    irq_finish(irq);

    irq_exit();
    set_irq_regs(old_regs);
}

 4、測試:

# insmod first_drv.ko 
# ./firstdrvtest on
s3c2410_timer_interrupt : pid = 771, task_name = firstdrvtest
pc = bf000084

4.1、查看內核中內核函數、加載的函數的地址

  #cat /proc/kallsyms > /kallsyms.txt 

  找到pc地址為bf000084附近的函數:

....................................
00000000 a first_drv.c    [first_drv]
bf000088 t first_drv_init    [first_drv]
bf000140 t first_drv_exit    [first_drv]
c48761cc ? __mod_license87    [first_drv]
bf000940 b $d    [first_drv]
bf000740 d first_drv_fops    [first_drv]
bf000740 d $d    [first_drv]
bf00003c t first_drv_write    [first_drv]  #pc=bf000084 落在 bf00003c 與 bf000088 之間,可以確定僵死的地方就在這個函數裡面
bf000000 t first_drv_open    [first_drv]
bf000000 t $a    [first_drv]
bf000038 t $d    [first_drv]
bf00003c t $a    [first_drv]
bf000114 t $d    [first_drv]
bf00094c b firstdrv_class    [first_drv]
bf000950 b firstdrv_class_dev    [first_drv]
bf000140 t $a    [first_drv]
bf000184 t $d    [first_drv]
00000000 a first_drv.mod.c    [first_drv]
c48761d8 ? __module_depends    [first_drv]
bf0008ac d $d    [first_drv]
c4876204 ? __mod_vermagic5    [first_drv]
c01bd44c u class_device_create    [first_drv]
c008ca94 u register_chrdev    [first_drv]
c01bd668 u class_device_unregister    [first_drv]
bf000948 b major    [first_drv]
bf000944 b gpfcon    [first_drv]
c0031ad0 u __iounmap    [first_drv]
c01bc968 u class_create    [first_drv]
bf0007c0 d __this_module    [first_drv]
bf000088 t init_module    [first_drv]
c008c9dc u unregister_chrdev    [first_drv]
bf000140 t cleanup_module    [first_drv]
c01bc9dc u class_destroy    [first_drv]
bf000940 b gpfdat    [first_drv]
c0031a6c u __arm_ioremap    [first_drv]
c0172f80 u __copy_from_user    [first_drv]
c01752e0 u __memzero    [first_drv] 

4.2、查看反匯編

  #arm-linux-objdump -D first_drv.ko > first_drv.dis

  在kallsyms.txt中可以知道,first_drv_write的入口地址為 bf00003c 

  打開first_drv.dis,如何查找真正僵死的位置?   (1)首先從反匯編文件中找到位置為00000000的函數:00000000 <first_drv_open>:   (2)在kallsyms.txt中,first_drv_open 的實際位置是:bf000000    (3)根據上面的信息可以知道,在反匯編文件中,發生僵死的位置為 00000084 - 4 處,也就是00000080   (4)00000084處的代碼位於函數first_drv_write中:
0000003c <first_drv_write>:
  3c:    e1a0c00d     mov    ip, sp
  40:    e92dd800     stmdb    sp!, {fp, ip, lr, pc}
  44:    e24cb004     sub    fp, ip, #4    ; 0x4
  48:    e24dd004     sub    sp, sp, #4    ; 0x4
  4c:    e3cd3d7f     bic    r3, sp, #8128 ; 0x1fc0
  50:    e3c3303f     bic    r3, r3, #63   ; 0x3f
  54:    e5933008     ldr    r3, [r3, #8]
  58:    e0910002     adds   r0, r1, r2
  5c:    30d00003     sbcccs    r0, r0, r3
  60:    33a03000     movcc  r3, #0       ; 0x0
  64:    e3530000     cmp    r3, #0       ; 0x0
  68:    e24b0010     sub    r0, fp, #16  ; 0x10
  6c:    1a00001c     bne    e4 <init_module+0x5c>
  70:    ebfffffe     bl     70 <first_drv_write+0x34>
  74:    ea00001f     b      f8 <init_module+0x70>
  78:    e3520000     cmp   r2, #0      ; 0x0
  7c:    11a01002     movne    r1, r2
  80:    1bfffffe     blne    80 <first_drv_write+0x44> #錯誤在這,死循環!!!!
  84:   ea00001f      b 108 <init_module+0x80>

   注意:在arm中,中斷保存的PC是當前指令地址加4,所以真正僵死的位置是:bf000080,也就是反匯編文件中的:80

Copyright © Linux教程網 All Rights Reserved