linux将程序锁死,Linux死锁现象及分析方法(转)
本節我們對Linux死鎖做一個基本的介紹,然后講解如何檢測并定位死鎖。
1. 什么是死鎖?
死鎖(DeadLock)是指兩個或者兩個以上的進程(線程)在執行過程中,因爭奪資源而造成的一種互相等待的現象,若無外力作用,它們都將無法推進下去。此時稱系統處于死鎖狀態或系統產生了死鎖,這些永遠在互相等待的進程(線程)稱為死鎖進程(線程)。由于資源占用是互斥的,當某個進程提出申請后,使得有關進程(線程)在無外力協助下,永遠分配不到必需的資源而無法繼續進行,這就產生了一種特殊現象——死鎖。
一種交叉持鎖死鎖的情形,此時執行程序中兩個或多個線程發生永久堵塞(等待),每個線程都在等待被其他線程占用并堵塞了的資源。例如,如果線程1鎖住了記錄A并等待記錄B,而線程2鎖住了記錄B并等待記錄A,這樣兩個線程就發生了死鎖現象。在計算機系統中,如果系統的資源分配策略不當,更常見的可能是程序員寫的程序有錯誤等,則會導致進程因競爭資源不當而產生死鎖的現象。
2. 產生死鎖的四個必要條件
1) 對臨界資源的互斥使用(資源獨占)
一個資源每次只能給一個進程(線程)使用,比如寫操作。
2) 占有且等待
進程在申請新的資源的同時,保持對原有資源的占有。
3) 不可搶占
資源申請者不能強行從資源占有者手中奪取資源,資源只能由占有者自愿釋放。
4) 循環等待
P1等待P2占有的資源,P2等待P3占有的資源, … Pn等待P1占有的資源,形成一個進程等待回路。
3. 一個例子及圖示
3.1 圖示
進程在執行一些代碼之后,子線程1獲得了鎖A,正試圖獲得鎖B;子線程2此時獲得鎖B,正試圖獲得鎖A,這樣子線程1和子線程2將沒辦法同時獲得鎖A和鎖B,因為它們各自被對方占有,永遠不會釋放,從而發生死鎖現象。
3.2 示例代碼
參看如下示例代碼(deadlock.c):
#include <unistd.h>
#include <pthread.h>
#include <string.h>
/* mutexA and mutexB are the two locks that func1() and func2() acquire in opposite order. */
pthread_mutex_t mutexA = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t mutexB = PTHREAD_MUTEX_INITIALIZER;
/* mutexC is only destroyed in main(); nothing in this listing locks it. */
pthread_mutex_t mutexC = PTHREAD_MUTEX_INITIALIZER;
/* Incremented by func1() and func2() while mutexA is held. */
static int counterA = 0;
/* Incremented by func1() and func2() while mutexB is held. */
static int counterB = 0;
/*
 * First half of the lock-order inversion: takes mutexA, then mutexB.
 *
 * The one-second sleep while holding mutexA is deliberate. It leaves time
 * for the thread running func2() to take mutexB, so each thread ends up
 * waiting for the mutex the other one holds. The acquisition order must
 * stay as it is; the deadlock is the point of the example.
 *
 * Returns the value of counterA sampled while mutexA is still held.
 */
int func1(void)
{
    int snapshot;

    pthread_mutex_lock(&mutexA);
    ++counterA;
    sleep(1);
    pthread_mutex_lock(&mutexB);
    ++counterB;
    pthread_mutex_unlock(&mutexB);
    /*
     * Sample counterA before releasing mutexA: func2() increments it under
     * the same mutex, so reading it after the unlock would be a data race.
     */
    snapshot = counterA;
    pthread_mutex_unlock(&mutexA);
    return snapshot;
}
/*
 * Second half of the lock-order inversion: takes mutexB, then mutexA,
 * i.e. the reverse of func1().
 *
 * The one-second sleep while holding mutexB is deliberate. It leaves time
 * for the thread running func1() to take mutexA, so each thread ends up
 * waiting for the mutex the other one holds. The acquisition order must
 * stay as it is; the deadlock is the point of the example.
 *
 * Returns the value of counterB sampled while mutexB is still held.
 */
int func2(void)
{
    int snapshot;

    pthread_mutex_lock(&mutexB);
    ++counterB;
    sleep(1);
    pthread_mutex_lock(&mutexA);
    ++counterA;
    pthread_mutex_unlock(&mutexA);
    /*
     * Sample counterB before releasing mutexB: func1() increments it under
     * the same mutex, so reading it after the unlock would be a data race.
     */
    snapshot = counterB;
    pthread_mutex_unlock(&mutexB);
    return snapshot;
}
/*
 * Thread entry point: calls func1() over and over and terminates the
 * calling thread once func1() reports the value 100000.
 * The thread argument is not used.
 */
void* start_routine1(void* arg)
{
    (void)arg;

    while (func1() != 100000)
    {
        /* nothing to do between calls */
    }
    pthread_exit(NULL);
}
/*
 * Thread entry point: calls func2() over and over and terminates the
 * calling thread once func2() reports the value 100000.
 * The thread argument is not used.
 */
void* start_routine2(void* arg)
{
    (void)arg;

    while (func2() != 100000)
    {
        /* nothing to do between calls */
    }
    pthread_exit(NULL);
}
/*
 * Thread entry point for the "bystander" threads: once per second it
 * copies the string passed as the thread argument into a local buffer.
 * It takes no locks and never returns.
 *
 * arg: NUL-terminated string, or NULL (in which case the buffer is simply
 *      left zeroed).
 */
void* start_routine(void* arg)
{
    const char *label = (const char *)arg;

    while (1)
    {
        char szBuf[128];

        sleep(1);
        memset(szBuf, 0, sizeof(szBuf));
        if (label != NULL)
        {
            /*
             * Bounded copy: never write more than the buffer can hold.
             * The buffer was zeroed above, so keeping the last byte
             * untouched guarantees NUL termination.
             */
            size_t len = strlen(label);
            if (len >= sizeof(szBuf))
            {
                len = sizeof(szBuf) - 1;
            }
            memcpy(szBuf, label, len);
        }
    }
}
/*
 * Starts the four demo threads (one running start_routine1, one running
 * start_routine2, two running start_routine), waits five seconds, then
 * joins them in creation order and destroys the three mutexes.
 * Exits immediately with status 1 if any thread cannot be created.
 */
int main()
{
    enum { THREAD_COUNT = 4 };

    /* Entry point and argument for each thread, in creation order. */
    const struct
    {
        void *(*entry)(void *);
        void *arg;
    } plan[THREAD_COUNT] = {
        { start_routine1, NULL },
        { start_routine2, NULL },
        { start_routine,  "thread3" },
        { start_routine,  "thread3" },
    };
    pthread_t tid[THREAD_COUNT];
    int i;

    for (i = 0; i < THREAD_COUNT; ++i)
    {
        if (pthread_create(&tid[i], NULL, plan[i].entry, plan[i].arg) != 0)
        {
            _exit(1);
        }
    }

    sleep(5);
    //pthread_cancel(tid[0]);

    for (i = 0; i < THREAD_COUNT; ++i)
    {
        pthread_join(tid[i], NULL);
    }

    pthread_mutex_destroy(&mutexA);
    pthread_mutex_destroy(&mutexB);
    pthread_mutex_destroy(&mutexC);
    return 0;
}
3.3 編譯程序并執行
執行如下命令編譯程序并執行:
# gcc -g -o deadlock deadlock.c -lpthread
# ./deadlock
# ps -ef | grep deadlock
root 20495 6820 0 13:50 pts/0 00:00:00 ./deadlock
4. 使用pstack和gdb工具對死鎖程序進行分析
4.1 pstack
pstack命令可顯示每個進程的棧跟蹤,pstack $pid即可,pstack命令須由$pid進程的屬主或者root運行。下面第一次執行:
# pstack 20495
Thread 5 (Thread 0x7f0d2359f700 (LWP 20496)):
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f0d2396ad02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f0d2396ac08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400917 in func1 () at deadlock.c:18
#4 0x00000000004009b4 in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f0d22d9e700 (LWP 20497)):
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f0d2396ad02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f0d2396ac08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400973 in func2 () at deadlock.c:31
#4 0x00000000004009e2 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f0d2259d700 (LWP 20498)):
#0 0x00007f0d2365e66d in nanosleep () from /lib64/libc.so.6
#1 0x00007f0d2365e504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f0d21d9c700 (LWP 20499)):
#0 0x00007f0d2365e66d in nanosleep () from /lib64/libc.so.6
#1 0x00007f0d2365e504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f0d23d80740 (LWP 20495)):
#0 0x00007f0d23969ef7 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400b17 in main () at deadlock.c:99
第二次執行:
# pstack 20495
Thread 5 (Thread 0x7f0d2359f700 (LWP 20496)):
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f0d2396ad02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f0d2396ac08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400917 in func1 () at deadlock.c:18
#4 0x00000000004009b4 in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f0d22d9e700 (LWP 20497)):
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f0d2396ad02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f0d2396ac08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400973 in func2 () at deadlock.c:31
#4 0x00000000004009e2 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f0d2259d700 (LWP 20498)):
#0 0x00007f0d2365e66d in nanosleep () from /lib64/libc.so.6
#1 0x00007f0d2365e504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f0d21d9c700 (LWP 20499)):
#0 0x00007f0d2365e66d in nanosleep () from /lib64/libc.so.6
#1 0x00007f0d2365e504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f0d2369776d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f0d23d80740 (LWP 20495)):
#0 0x00007f0d23969ef7 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400b17 in main () at deadlock.c:99
連續多次查看這個進程的函數調用關系堆棧,死鎖線程將一直處于等鎖狀態,對比多次的函數調用堆棧輸出結果,確定哪兩個線程(或者幾個線程)一直沒有變化且一直處于等鎖的狀態。
分析:
根據上面的輸出對比,線程2和線程3的pstack輸出表明其正處于sleep狀態,線程1(主線程)則在pthread_join中等待子線程結束,都沒有問題。但是線程4和線程5一直處在等鎖狀態(pthread_mutex_lock),在連續兩次的pstack信息輸出中沒有發生變化,所以我們可以推測線程4和線程5發生了死鎖。
然后,我們通過gdb attach到死鎖進程:
# gdb attach 20495
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-94.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
...
attach: 沒有那個文件或目錄.
Attaching to process 20495
Reading symbols from /data/home/lzy/just_for_test/deadlock...done.
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
[New LWP 20499]
[New LWP 20498]
[New LWP 20497]
[New LWP 20496]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
0x00007f0d23969ef7 in pthread_join () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7_3.2.x86_64
(gdb) info thread
Id Target Id Frame
5 Thread 0x7f0d2359f700 (LWP 20496) "deadlock" 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
4 Thread 0x7f0d22d9e700 (LWP 20497) "deadlock" 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
3 Thread 0x7f0d2259d700 (LWP 20498) "deadlock" 0x00007f0d2365e66d in nanosleep () from /lib64/libc.so.6
2 Thread 0x7f0d21d9c700 (LWP 20499) "deadlock" 0x00007f0d2365e66d in nanosleep () from /lib64/libc.so.6
* 1 Thread 0x7f0d23d80740 (LWP 20495) "deadlock" 0x00007f0d23969ef7 in pthread_join () from /lib64/libpthread.so.0
查看線程4和線程5的輸出:
(gdb) thread 5
[Switching to thread 5 (Thread 0x7f0d2359f700 (LWP 20496))]
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) where
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f0d2396ad02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f0d2396ac08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400917 in func1 () at deadlock.c:18
#4 0x00000000004009b4 in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f0d2369776d in clone () from /lib64/libc.so.6
(gdb) frame 3
#3 0x0000000000400917 in func1 () at deadlock.c:18
18 pthread_mutex_lock(&mutexB);
(gdb) thread 4
[Switching to thread 4 (Thread 0x7f0d22d9e700 (LWP 20497))]
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
(gdb) where
#0 0x00007f0d2396f1bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f0d2396ad02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f0d2396ac08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400973 in func2 () at deadlock.c:31
#4 0x00000000004009e2 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f0d23968dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f0d2369776d in clone () from /lib64/libc.so.6
(gdb) frame 3
#3 0x0000000000400973 in func2 () at deadlock.c:31
31 pthread_mutex_lock(&mutexA);
(gdb) p mutexA
$1 = {__data = {__lock = 2, __count = 0, __owner = 20496, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\020P\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) p mutexB
$2 = {__data = {__lock = 2, __count = 0, __owner = 20497, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\021P\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb)
從上面可以發現,線程4正試圖獲得鎖mutexA,但是鎖mutexA已經被LWP為20496的線程得到(__owner=20496),線程5正試圖獲得鎖mutexB,但是鎖mutexB已經被LWP為20497的線程得到(__owner=20497),從pstack的輸出可以發現LWP 20496與線程5對應,LWP 20497與線程4對應。所以我們可以得出,線程4和線程5發生了交叉持鎖的現象。查看線程的源代碼發現,線程4和線程5同時使用mutexA和mutexB,且申請順序不合理。
5. 利用core文件分析
運行./deadlock(編譯的時候加調試選項-g):
# ulimit -c
0
# ulimit -c unlimited
# ulimit -c
unlimited
# ./deadlock
# ps -ef | grep deadlock
root 30811 6820 0 14:32 pts/0 00:00:00 ./deadlock
[root@compile just_for_test]# pstack 30811
Thread 5 (Thread 0x7f14b4c78700 (LWP 30812)):
#0 0x00007f14b50481bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f14b5043d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f14b5043c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400917 in func1 () at deadlock.c:18
#4 0x00000000004009b4 in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f14b4477700 (LWP 30813)):
#0 0x00007f14b50481bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f14b5043d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f14b5043c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400973 in func2 () at deadlock.c:31
#4 0x00000000004009e2 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f14b3c76700 (LWP 30814)):
#0 0x00007f14b4d3766d in nanosleep () from /lib64/libc.so.6
#1 0x00007f14b4d37504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f14b3475700 (LWP 30815)):
#0 0x00007f14b4d3766d in nanosleep () from /lib64/libc.so.6
#1 0x00007f14b4d37504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f14b5459740 (LWP 30811)):
#0 0x00007f14b5042ef7 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400b17 in main () at deadlock.c:99
按CTRL+\產生coredump:
# ls
core.30811 deadlock deadlock.c
然后再用gdb來調試該coredump文件:
# gdb ./deadlock core.30811
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-94.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
...
Reading symbols from /data/home/lzy/just_for_test/deadlock...done.
[New LWP 30811]
[New LWP 30814]
[New LWP 30813]
[New LWP 30812]
[New LWP 30815]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `./deadlock'.
Program terminated with signal 3, Quit.
#0 0x00007f14b5042ef7 in pthread_join () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install glibc-2.17-157.el7_3.2.x86_64
(gdb)
(gdb)
(gdb) thread apply all bt
Thread 5 (Thread 0x7f14b3475700 (LWP 30815)):
#0 0x00007f14b4d3766d in nanosleep () from /lib64/libc.so.6
#1 0x00007f14b4d37504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 4 (Thread 0x7f14b4c78700 (LWP 30812)):
#0 0x00007f14b50481bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f14b5043d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f14b5043c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400917 in func1 () at deadlock.c:18
#4 0x00000000004009b4 in start_routine1 (arg=0x0) at deadlock.c:43
#5 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 3 (Thread 0x7f14b4477700 (LWP 30813)):
#0 0x00007f14b50481bd in __lll_lock_wait () from /lib64/libpthread.so.0
#1 0x00007f14b5043d02 in _L_lock_791 () from /lib64/libpthread.so.0
#2 0x00007f14b5043c08 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3 0x0000000000400973 in func2 () at deadlock.c:31
#4 0x00000000004009e2 in start_routine2 (arg=0x0) at deadlock.c:56
#5 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7f14b3c76700 (LWP 30814)):
#0 0x00007f14b4d3766d in nanosleep () from /lib64/libc.so.6
#1 0x00007f14b4d37504 in sleep () from /lib64/libc.so.6
#2 0x0000000000400a16 in start_routine (arg=0x400bf4) at deadlock.c:69
#3 0x00007f14b5041dc5 in start_thread () from /lib64/libpthread.so.0
#4 0x00007f14b4d7076d in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7f14b5459740 (LWP 30811)):
#0 0x00007f14b5042ef7 in pthread_join () from /lib64/libpthread.so.0
#1 0x0000000000400b17 in main () at deadlock.c:99
(gdb)
從上面我們也可以看到相應的死鎖方面的信息。
6. 利用valgrind(DRD+Helgrind)來分析死鎖
下面我們將介紹如何使用valgrind來排查死鎖問題(說明: 這里DRD是Data Race Detector的縮寫)。我們先構造一個死鎖場景(dead_lock.c):
#include <pthread.h>
/* Locked first by thread_routine(), second by lock(). */
pthread_mutex_t s_mutex_a;
/* Locked first by lock(), second by thread_routine(). */
pthread_mutex_t s_mutex_b;
/* Initialised in main() with a count of 2; lock() and thread_routine() each wait on it after taking their first mutex. */
pthread_barrier_t s_barrier;
/*
 * Main-thread half of the deadlock scenario.
 *
 * Takes s_mutex_b, waits on s_barrier, and only then tries to take
 * s_mutex_a. thread_routine() does the same with the two mutexes swapped,
 * so once both sides have passed the barrier each one is blocked on the
 * mutex the other still holds.
 */
void lock() {
    pthread_mutex_lock(&s_mutex_b);

    pthread_barrier_wait(&s_barrier); // line 10 of dead_lock.c
    pthread_mutex_lock(&s_mutex_a);
    pthread_mutex_unlock(&s_mutex_a);

    pthread_mutex_unlock(&s_mutex_b);
}
/*
 * Worker-thread half of the deadlock scenario.
 *
 * Takes s_mutex_a, waits on s_barrier, and only then tries to take
 * s_mutex_b. lock() does the same with the two mutexes swapped, so once
 * both sides have passed the barrier each one is blocked on the mutex the
 * other still holds.
 *
 * arg: unused.
 * Returns a null pointer (the thread produces no result).
 */
static void* thread_routine(void* arg) {
    (void)arg;

    pthread_mutex_lock(&s_mutex_a);
    {
        pthread_barrier_wait(&s_barrier); // line 21 of dead_lock.c
        pthread_mutex_lock(&s_mutex_b);
        pthread_mutex_unlock(&s_mutex_b);
    }
    pthread_mutex_unlock(&s_mutex_a);

    /* A void* thread function must return a value; the original fell off the end. */
    return 0;
}
/*
 * Sets up the two mutexes and a two-party barrier, starts one worker
 * running thread_routine(), and calls lock() on the main thread. The two
 * sides take the mutexes in opposite order, which is the deadlock that the
 * valgrind runs in the article report.
 *
 * Returns 0 on normal completion, 1 if the worker thread cannot be created.
 */
int main(int argc, char** argv) {
    pthread_t tid;

    (void)argc;
    (void)argv;

    pthread_mutex_init(&s_mutex_a, 0);
    pthread_mutex_init(&s_mutex_b, 0);
    /* Two participants: the main thread (inside lock()) and the worker. */
    pthread_barrier_init(&s_barrier, 0, 2);

    if (pthread_create(&tid, 0, &thread_routine, 0) != 0) {
        /*
         * Without the worker the barrier in lock() would never open, and
         * the program would hang for a reason unrelated to the demo.
         */
        pthread_barrier_destroy(&s_barrier);
        pthread_mutex_destroy(&s_mutex_a);
        pthread_mutex_destroy(&s_mutex_b);
        return 1;
    }

    lock();
    pthread_join(tid, 0);
    /*
     * No pthread_cancel() here: after a successful join the thread ID is
     * no longer valid, so passing it to another pthread call is undefined.
     */

    pthread_barrier_destroy(&s_barrier);
    pthread_mutex_destroy(&s_mutex_a);
    pthread_mutex_destroy(&s_mutex_b);
    return 0;
}
上面這段代碼我們只要關注lock和thread_routine兩個方法。lock()方法在主線程中執行,它先給s_mutex_b上鎖,然后通過屏障s_barrier等待線程也執行到屏障處(第21行);thread_routine()是線程函數,它先給s_mutex_a上鎖,然后通過屏障s_barrier等待
主線程也執行到屏障處(第10行)。
主線程和子線程都執行到屏障處后,屏障被打開,它們繼續向下執行: 主線程執行到第12行試圖獲取s_mutex_a;子線程執行到第23行試圖獲取s_mutex_b。由于這兩個互斥量已經被占用,所以產生死鎖。
這是通過代碼分析出來的,但是對于比較大的工程項目,我們則需要通過工具來分析。下面我們使用valgrind來分析:
# gcc -g -o dead_lock dead_lock.c -lpthread
# valgrind --tool=drd --trace-mutex=yes ./dead_lock
==9373== drd, a thread error detector
==9373== Copyright (C) 2006-2015, and GNU GPL'd, by Bart Van Assche.
==9373== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==9373== Command: ./dead_lock
==9373==
==9373== [1] mutex_init mutex 0x6010c0
==9373== [1] mutex_init mutex 0x601120
==9373== [1] mutex_init mutex 0xffeffff10
==9373== [1] mutex_ignore_ordering mutex 0xffeffff10
==9373== [1] mutex_trylock mutex 0xffeffff10 rc 0 owner 0
==9373== [1] post_mutex_lock mutex 0xffeffff10 rc 0 owner 0
==9373== [1] mutex_unlock mutex 0xffeffff10 rc 1
==9373== [2] mutex_trylock mutex 0xffeffff10 rc 0 owner 1
==9373== [2] post_mutex_lock mutex 0xffeffff10 rc 0 owner 1
==9373== [2] mutex_unlock mutex 0xffeffff10 rc 1
==9373== [2] mutex_trylock mutex 0x6010c0 rc 0 owner 0
==9373== [2] post_mutex_lock mutex 0x6010c0 rc 0 owner 0
==9373== [1] mutex_trylock mutex 0xffeffff10 rc 0 owner 2
==9373== [1] post_mutex_lock mutex 0xffeffff10 rc 0 owner 2
==9373== [1] mutex_unlock mutex 0xffeffff10 rc 1
==9373== [1] mutex_destroy mutex 0xffeffff10 rc 0 owner 1
==9373== [1] mutex_trylock mutex 0x601120 rc 0 owner 0
==9373== [1] post_mutex_lock mutex 0x601120 rc 0 owner 0
==9373== [1] mutex_trylock mutex 0x6010c0 rc 1 owner 2//18行
==9373== [2] mutex_trylock mutex 0x601120 rc 1 owner 1//19行
通過使用上面的指令,讓valgrind把互斥量相關的信息打印出來了。
第18行顯示線程1試圖給0x6010c0互斥量上鎖,但是該互斥量的所有者(owner)是線程2;
第19行顯示線程2試圖給0x601120互斥量上鎖,但是該互斥量的所有者(owner)是線程1;
如此,我們便可以確定這段程序卡住是因為死鎖導致的。但是DRD有個問題,不能指出發生死鎖的位置。這個時候Helgrind就該出場了:
# valgrind --tool=helgrind ./dead_lock
==14606== Helgrind, a thread error detector
==14606== Copyright (C) 2007-2015, and GNU GPL'd, by OpenWorks LLP et al.
==14606== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==14606== Command: ./dead_lock
==14606==
helgrind執行時,如果發生死鎖,需要ctrl+c來終止運行,于是可以得到如下結果:
^C==14606==
==14606== Process terminating with default action of signal 2 (SIGINT)
==14606== at 0x4E471BD: __lll_lock_wait (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E42D01: _L_lock_791 (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E42C06: pthread_mutex_lock (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4C2BC4B: mutex_lock_WRK (hg_intercepts.c:894)
==14606== by 0x4C2FB0D: pthread_mutex_lock (hg_intercepts.c:917)
==14606== by 0x400947: lock (dead_lock.c:12)
==14606== by 0x400A03: main (dead_lock.c:38)
==14606== ---Thread-Announcement------------------------------------------
==14606==
==14606== Thread #1 is the program's root thread
==14606==
==14606== ----------------------------------------------------------------
==14606==
==14606== Thread #1: Exiting thread still holds 1 lock
==14606== at 0x4E471BD: __lll_lock_wait (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E42D01: _L_lock_791 (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E42C06: pthread_mutex_lock (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4C2BC4B: mutex_lock_WRK (hg_intercepts.c:894)
==14606== by 0x4C2FB0D: pthread_mutex_lock (hg_intercepts.c:917)
==14606== by 0x400947: lock (dead_lock.c:12)//22行
==14606== by 0x400A03: main (dead_lock.c:38)
==14606==
==14606== ---Thread-Announcement------------------------------------------
==14606==
==14606== Thread #2 was created
==14606== at 0x514C72E: clone (in /usr/lib64/libc-2.17.so)
==14606== by 0x4E3FF79: do_clone.constprop.4 (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E41468: pthread_create@@GLIBC_2.2.5 (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4C2E64A: pthread_create_WRK (hg_intercepts.c:427)
==14606== by 0x4C2F728: pthread_create@* (hg_intercepts.c:460)
==14606== by 0x4009F9: main (dead_lock.c:36)
==14606==
==14606== ----------------------------------------------------------------
==14606==
==14606== Thread #2: Exiting thread still holds 1 lock
==14606== at 0x4E471BD: __lll_lock_wait (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E42D01: _L_lock_791 (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4E42C06: pthread_mutex_lock (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x4C2BC4B: mutex_lock_WRK (hg_intercepts.c:894)
==14606== by 0x4C2FB0D: pthread_mutex_lock (hg_intercepts.c:917)
==14606== by 0x400987: thread_routine (dead_lock.c:23)//43行
==14606== by 0x4C2E83E: mythread_wrapper (hg_intercepts.c:389)
==14606== by 0x4E40DC4: start_thread (in /usr/lib64/libpthread-2.17.so)
==14606== by 0x514C76C: clone (in /usr/lib64/libc-2.17.so)
==14606==
==14606==
==14606== For counts of detected and suppressed errors, rerun with: -v
==14606== Use --history-level=approx or =none to gain increased speed, at
==14606== the cost of reduced accuracy of conflicting-access information
==14606== ERROR SUMMARY: 2 errors from 2 contexts (suppressed: 2 from 2)
已殺死
第22行和第43行分別顯示了主線程和子線程在中斷之前,都鎖在哪一行。這樣就更容易定位問題了。
[參看]
總結
以上是生活随笔為你收集整理的linux将程序锁死,Linux死锁现象及分析方法(转)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: var目录详解
- 下一篇: linux系统月初月末,Linux la