[performace] recursive v.s. non-recursive

參觀人次統計

今日人氣: 32

累積人氣: 188105

最新文章

[C] VC compiler的最佳化

[AI] 讓 langchain_openai 可以支援 reasoning_content

[AI] 可以關掉 Qwen3.5 thinking mode

[AI] build llama.cpp in windows

[python] 讓 LLM 的 query 可以留下 SSLKEYLOGFILE

2009 06 05 24 23 [performace] recursive v.s. non-recursive

Update: 原來是因為 VC2005 最佳化的關係，

它會把 fib1() 移出原本用 rdtsc() 包夾起來的位置，

導致量測不到正確的數字。

解決方法：加個 volatile 去讓 VC2005 不要對 fib1() 最佳化。

不過 VC2005 的確是變聰明了，

下次要測performance數據就得多小心了～～～

recursive:	fib1(30) =	832040	time =	71,891,699
non-recursive:	fib2(30) =	832040	time =	228,775,393
non-recursive:	fib3(30) =	832040	time =	39,393,574
non-recursive:	fib4(30) =	832040	time =	97,573,982

整件事的起因只是在某次會議上，

有位同事在說明程式上的問題，

順道提起說 non-recursive （非遞迴）比 recursive （遞迴）慢。

我問了一下，為什麼會這麼說，

他回答說用 stl stack 去 implement non-recursive 會比較慢。

另外一位同事寫了一小段code去驗證這件事，

他用的是 Fibonacci number 當作例子，

而得到的結論是 non-recursive 慢了 60-70 倍。

的確，用 C++ 來 implement (實作) stack 是很有可能比較慢。

但是如果用純C 來 implement (實作) stack 又會怎麼樣呢?

我用VC2005編譯下面的原始碼，

測試了 Release 版和 Debug 版的結果。

先來看 Debug 版的結果，

不出所料，用純C non-recursive 的確有可能打敗recursive的作法。

    recursive: fib1(30) = 832040, time =    618,553,507
non-recursive: fib2(30) = 832040, time = 46,321,047,773
non-recursive: fib3(30) = 832040, time =    194,134,798
non-recursive: fib4(30) = 832040, time =   2,690,298,468
    recursive: fib1(30) = 832040, time =    645,310,655
non-recursive: fib2(30) = 832040, time = 46,226,320,294
non-recursive: fib3(30) = 832040, time =    194,315,847
non-recursive: fib4(30) = 832040, time =   2,762,798,214
    recursive: fib1(30) = 832040, time =    636,833,516
non-recursive: fib2(30) = 832040, time = 46,155,658,252
non-recursive: fib3(30) = 832040, time =    207,529,036
non-recursive: fib4(30) = 832040, time =   2,812,436,374

不過切換到 Release 版，

卻得到一個奇特的結果。

    recursive: fib1(30) = 832040, time =          495
non-recursive: fib2(30) = 832040, time =    231,199,188
non-recursive: fib3(30) = 832040, time =     37,900,918
non-recursive: fib4(30) = 832040, time =    103,971,483
    recursive: fib1(30) = 832040, time =          495
non-recursive: fib2(30) = 832040, time =    229,024,697
non-recursive: fib3(30) = 832040, time =     37,525,301
non-recursive: fib4(30) = 832040, time =    103,438,654
    recursive: fib1(30) = 832040, time =          495
non-recursive: fib2(30) = 832040, time =    253,850,311
non-recursive: fib3(30) = 832040, time =     38,306,884
non-recursive: fib4(30) = 832040, time =    111,237,841

recursive 版本快到不可思議。

讓我們來看看編譯的Machine Codes.

這是 Release 版的 fib1 ()

PUBLIC    ?fib1@@YAKK@Z                    ; fib1
; Function compile flags: /Ogtpy
;    COMDAT ?fib1@@YAKK@Z
_TEXT    SEGMENT
_n$ = 8                            ; size = 4
?fib1@@YAKK@Z PROC                    ; fib1, COMDAT

; 20   : {

00000    56        push    esi

; 21   :
; 22   :     if (n <= 1)

00001    8b 74 24 08    mov    esi, DWORD PTR _n$[esp]
00005    83 fe 01    cmp    esi, 1
00008    77 04        ja    SHORT $LN2@fib1

; 23   :     {
; 24   :         return n;

0000a    8b c6        mov    eax, esi
0000c    5e        pop    esi

; 29   :     }
; 30   : }

0000d    c3        ret    0
$LN2@fib1:

; 25   :     }
; 26   :     else
; 27   :     {
; 28   :         return fib1(n-1)+fib1(n-2);

0000e    8d 46 fe    lea    eax, DWORD PTR [esi-2]
00011    57        push    edi
00012    50        push    eax
00013    e8 00 00 00 00    call    ?fib1@@YAKK@Z        ; fib1
00018    83 c6 ff    add    esi, -1
0001b    56        push    esi
0001c    8b f8        mov    edi, eax
0001e    e8 00 00 00 00    call    ?fib1@@YAKK@Z        ; fib1
00023    83 c4 08    add    esp, 8
00026    03 c7        add    eax, edi
00028    5f        pop    edi
00029    5e        pop    esi

; 29   :     }
; 30   : }

0002a    c3        ret    0
?fib1@@YAKK@Z ENDP

而這是 Debug 版的 fib1() 。

;    COMDAT ?fib1@@YAKK@Z
_TEXT    SEGMENT
_n$ = 8                            ; size = 4
?fib1@@YAKK@Z PROC                    ; fib1, COMDAT

; 20   : {

00000    55        push    ebp
00001    8b ec        mov    ebp, esp
00003    81 ec c0 00 00
    00        sub    esp, 192        ; 000000c0H
00009    53        push    ebx
0000a    56        push    esi
0000b    57        push    edi
0000c    8d bd 40 ff ff
    ff        lea    edi, DWORD PTR [ebp-192]
00012    b9 30 00 00 00    mov    ecx, 48            ; 00000030H
00017    b8 cc cc cc cc    mov    eax, -858993460        ; ccccccccH
0001c    f3 ab        rep stosd

; 21   :
; 22   :     if (n <= 1)

0001e    83 7d 08 01    cmp    DWORD PTR _n$[ebp], 1
00022    77 07        ja    SHORT $LN2@fib1

; 23   :     {
; 24   :         return n;

00024    8b 45 08    mov    eax, DWORD PTR _n$[ebp]
00027    eb 24        jmp    SHORT $LN3@fib1

; 25   :     }
; 26   :     else

00029    eb 22        jmp    SHORT $LN3@fib1
$LN2@fib1:

; 27   :     {
; 28   :         return fib1(n-1)+fib1(n-2);

0002b    8b 45 08    mov    eax, DWORD PTR _n$[ebp]
0002e    83 e8 01    sub    eax, 1
00031    50        push    eax
00032    e8 00 00 00 00    call    ?fib1@@YAKK@Z        ; fib1
00037    83 c4 04    add    esp, 4
0003a    8b f0        mov    esi, eax
0003c    8b 4d 08    mov    ecx, DWORD PTR _n$[ebp]
0003f    83 e9 02    sub    ecx, 2
00042    51        push    ecx
00043    e8 00 00 00 00    call    ?fib1@@YAKK@Z        ; fib1
00048    83 c4 04    add    esp, 4
0004b    03 c6        add    eax, esi
$LN3@fib1:

; 29   :     }
; 30   : }

0004d    5f        pop    edi
0004e    5e        pop    esi
0004f    5b        pop    ebx
00050    81 c4 c0 00 00
    00        add    esp, 192        ; 000000c0H
00056    3b ec        cmp    ebp, esp
00058    e8 00 00 00 00    call    __RTC_CheckEsp
0005d    8b e5        mov    esp, ebp
0005f    5d        pop    ebp
00060    c3        ret    0
?fib1@@YAKK@Z ENDP                    ; fib1
_TEXT    ENDS

可以很清楚的看到 Release 版 fib1() 已經最佳化了，

沒有所謂 register 不夠用的問題。

618,553,507 / 495 = 1,249,603.04444444

所以才會造成 Release 版比 Debug 版快了 124萬倍。(誇張的數字～～～)

不過有點離題了。

總之，recursive 和 non-recursive 理論上是差不了多少，

因為 recursive 如果每次 function call 都需要重新 initial 變數，

以及本身 call function 也是需要時間。

而 non-recursive 則是需要時間來 allocate 記憶體來當作 stack 使用。

也可以看出來 std::stack 是很好用沒錯，

不過他本身的 performance 還是比不上純C 直接 inline 進去。

std::stack 的performance 也輸掉我自己 implement c++ stack，

這倒是讓我有點意料之外。

原本以為應該差不了多少，

看了一下 std::stack 的 source codes ，

才發現它是用 dqueue 來實作，

這也難怪會這樣了～～～

#include "Windows.h"
#include "stack"

static unsigned __int64 rdtsc(void) {
   __asm cpuid;
   __asm rdtsc;
}
 

// recursive implementation
unsigned long fib1(volatile unsigned long n)
{
    if (n <= 1)
    {
        return n;
    }
    else
    {
        return fib1(n-1)+fib1(n-2);
    }
}

 

//  non-recursive std::stack implementation
unsigned long fib2(unsigned long n)
{
    std::stack stack;
    unsigned long result = 0;
    stack.push(n);
    while (false == stack.empty())
    {
        unsigned long m = stack.top();
        stack.pop();
        if (m <= 1) {
            result += m;
        } else {
            stack.push(m-1);
            stack.push(m-2);
        }
    }
    return result;
}

// non-recursive C-style implementation
unsigned long fib3(unsigned long n)
{
#define STACK_SIZE 128
    unsigned long *pStack = NULL;
    unsigned long result = 0;
    int nStack=0;
    pStack = (unsigned long *)malloc(STACK_SIZE*sizeof(unsigned long));

#define STACK_PUSH(n) {pStack[nStack]=n;nStack++;if (nStack>STACK_SIZE) {result = -1;goto END;}}
#define STACK_EMPTY() (nStack<=0)
#define STACK_TOP()   (pStack[nStack-1])
#define STACK_POP()   {nStack--;}

    STACK_PUSH(n);
    while (false == STACK_EMPTY())
    {
        unsigned long m = STACK_TOP();

        STACK_POP();
        if (m <= 1) {
            result += m;
        } else {
            STACK_PUSH(m-1);
            STACK_PUSH(m-2);
        }
    }

END:
	if (pStack) {
		free(pStack);
	}
    return result;

} 

// non-recursive C++-Style implementation
class stack_int
{
public:
    stack_int(int size);
    ~stack_int();
    int push(unsigned long n);
    unsigned long pop();
    unsigned long top();
    bool empty();
private:
    unsigned long *pStack;
    int nStack;
    int nStackSize;
};

stack_int::stack_int(int size)
{
	pStack = (unsigned long*) malloc(size*sizeof(unsigned long));
	nStack = 0;
	nStackSize = size;
}

stack_int::~stack_int()
{
	if (pStack) free(pStack);
}

inline int stack_int::push(unsigned long n)
{
	if (pStack == NULL) {
		return -1;
	}
	if (nStack>=nStackSize ) {
		return -1;
	}
	pStack[nStack]=n;
	nStack++;
}

inline  unsigned long stack_int::top()
{
	if (pStack == NULL) return 0;
	if (nStack <= 0) return 0;
	return pStack[nStack-1];
}

inline unsigned long stack_int::pop()
{
	nStack--;
	return 0;
}

inline  bool stack_int::empty()
{
	if (nStack>0) return false;
	return true;
}


unsigned long fib4(unsigned long n)

{
    stack_int *pStackInt = new stack_int(512);
    unsigned long result = 0;
    
    pStackInt->push(n);
    while (false == pStackInt->empty())
    {
        unsigned long m = pStackInt->top();
        pStackInt->pop();
        if (m <= 1) {
            result += m;
        } else {
            pStackInt->push(m-1);
            pStackInt->push(m-2);
        }
    }

    delete (pStackInt);
    return result;
} 


int main(int argc, char* argv[])

{
    unsigned int depth = 30;
    int i;
    int result;

    __int64 nStart;
    __int64 nEnd;


    HANDLE hThread =GetCurrentThread();
    if (SetThreadAffinityMask(hThread ,1)==0) {
        printf("SetThreadAffinityMask Errorn");
    }


    for (i=0;i<10;i++) {
        {
            nStart = rdtsc();
            result = fib1(depth);
            nEnd = rdtsc();
            printf("    recursive: fib1(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
        }
        if (1) {
            nStart = rdtsc();
            result = fib2(depth);
            nEnd = rdtsc();
            printf("non-recursive: fib2(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
        }

        {
            nStart = rdtsc();
            result = fib3(depth);
            nEnd = rdtsc();
            printf("non-recursive: fib3(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
        }
        {
            nStart = rdtsc();
            result = fib4(depth);
            nEnd = rdtsc();
            printf("non-recursive: fib4(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
        }

    }
    getchar();
    return 0;
}

上一頁 130 131 132 133 134 135 136 137 138 139 下一頁