Update: 原來是因為 VC2005 最佳化的關係,
它會把 fib1() 移出原本用 rdtsc() 包夾起來的位置,
導致量測不到正確的數字。
解決方法: 加個 volatile 去讓 VC2005 不要對 fib1() 最佳化。
不過 VC2005 的確是變聰明了,
下次要測performance數據就得多小心了~~~
| recursive: | fib1(30) = | 832040 | time = | 71,891,699 |
| non-recursive: | fib2(30) = | 832040 | time = | 228,775,393 |
| non-recursive: | fib3(30) = | 832040 | time = | 39,393,574 |
| non-recursive: | fib4(30) = | 832040 | time = | 97,573,982 |
整件事的起因只是在某次會議上,
有位同事在說明程式上的問題,
順道提起說 non-recursive (非遞迴)比 recursive (遞迴)慢。
我問了一下,為什麼會這麼說,
他回答說用 stl stack 去 implement non-recursive 會比較慢。
另外一位同事寫了一小段code去驗證這件事,
他用的是 Fibonacci number 當作例子,
而得到的結論是 non-recursive 慢了 60-70 倍。
的確,用 C++ 來 implement (實作) stack 是很有可能比較慢。
但是如果用 純C 來 implement (實作) stack 又會怎麼樣呢?
我用VC2005編譯下面的原始碼,
測試了 Release 版和 Debug 版的結果。
先來看 Debug 版的結果,
不出所料,用純C non-recursive 的確有可能打敗recursive的作法。
recursive: fib1(30) = 832040, time = 618,553,507
non-recursive: fib2(30) = 832040, time = 46,321,047,773
non-recursive: fib3(30) = 832040, time = 194,134,798
non-recursive: fib4(30) = 832040, time = 2,690,298,468
recursive: fib1(30) = 832040, time = 645,310,655
non-recursive: fib2(30) = 832040, time = 46,226,320,294
non-recursive: fib3(30) = 832040, time = 194,315,847
non-recursive: fib4(30) = 832040, time = 2,762,798,214
recursive: fib1(30) = 832040, time = 636,833,516
non-recursive: fib2(30) = 832040, time = 46,155,658,252
non-recursive: fib3(30) = 832040, time = 207,529,036
non-recursive: fib4(30) = 832040, time = 2,812,436,374
不過切換到 Release 版,
卻得到一個奇特的結果。
recursive: fib1(30) = 832040, time = 495
non-recursive: fib2(30) = 832040, time = 231,199,188
non-recursive: fib3(30) = 832040, time = 37,900,918
non-recursive: fib4(30) = 832040, time = 103,971,483
recursive: fib1(30) = 832040, time = 495
non-recursive: fib2(30) = 832040, time = 229,024,697
non-recursive: fib3(30) = 832040, time = 37,525,301
non-recursive: fib4(30) = 832040, time = 103,438,654
recursive: fib1(30) = 832040, time = 495
non-recursive: fib2(30) = 832040, time = 253,850,311
non-recursive: fib3(30) = 832040, time = 38,306,884
non-recursive: fib4(30) = 832040, time = 111,237,841
recursive 版本快到不可思議。
讓我們來看看編譯的Machine Codes.
這是 Release 版的 fib1 ()
PUBLIC ?fib1@@YAKK@Z ; fib1
; Function compile flags: /Ogtpy
; COMDAT ?fib1@@YAKK@Z
_TEXT SEGMENT
_n$ = 8 ; size = 4
?fib1@@YAKK@Z PROC ; fib1, COMDAT
; 20 : {
00000 56 push esi
; 21 :
; 22 : if (n <= 1)
00001 8b 74 24 08 mov esi, DWORD PTR _n$[esp]
00005 83 fe 01 cmp esi, 1
00008 77 04 ja SHORT $LN2@fib1
; 23 : {
; 24 : return n;
0000a 8b c6 mov eax, esi
0000c 5e pop esi
; 29 : }
; 30 : }
0000d c3 ret 0
$LN2@fib1:
; 25 : }
; 26 : else
; 27 : {
; 28 : return fib1(n-1)+fib1(n-2);
0000e 8d 46 fe lea eax, DWORD PTR [esi-2]
00011 57 push edi
00012 50 push eax
00013 e8 00 00 00 00 call ?fib1@@YAKK@Z ; fib1
00018 83 c6 ff add esi, -1
0001b 56 push esi
0001c 8b f8 mov edi, eax
0001e e8 00 00 00 00 call ?fib1@@YAKK@Z ; fib1
00023 83 c4 08 add esp, 8
00026 03 c7 add eax, edi
00028 5f pop edi
00029 5e pop esi
; 29 : }
; 30 : }
0002a c3 ret 0
?fib1@@YAKK@Z ENDP
而這是 Debug 版的 fib1() 。
; COMDAT ?fib1@@YAKK@Z
_TEXT SEGMENT
_n$ = 8 ; size = 4
?fib1@@YAKK@Z PROC ; fib1, COMDAT
; 20 : {
00000 55 push ebp
00001 8b ec mov ebp, esp
00003 81 ec c0 00 00
00 sub esp, 192 ; 000000c0H
00009 53 push ebx
0000a 56 push esi
0000b 57 push edi
0000c 8d bd 40 ff ff
ff lea edi, DWORD PTR [ebp-192]
00012 b9 30 00 00 00 mov ecx, 48 ; 00000030H
00017 b8 cc cc cc cc mov eax, -858993460 ; ccccccccH
0001c f3 ab rep stosd
; 21 :
; 22 : if (n <= 1)
0001e 83 7d 08 01 cmp DWORD PTR _n$[ebp], 1
00022 77 07 ja SHORT $LN2@fib1
; 23 : {
; 24 : return n;
00024 8b 45 08 mov eax, DWORD PTR _n$[ebp]
00027 eb 24 jmp SHORT $LN3@fib1
; 25 : }
; 26 : else
00029 eb 22 jmp SHORT $LN3@fib1
$LN2@fib1:
; 27 : {
; 28 : return fib1(n-1)+fib1(n-2);
0002b 8b 45 08 mov eax, DWORD PTR _n$[ebp]
0002e 83 e8 01 sub eax, 1
00031 50 push eax
00032 e8 00 00 00 00 call ?fib1@@YAKK@Z ; fib1
00037 83 c4 04 add esp, 4
0003a 8b f0 mov esi, eax
0003c 8b 4d 08 mov ecx, DWORD PTR _n$[ebp]
0003f 83 e9 02 sub ecx, 2
00042 51 push ecx
00043 e8 00 00 00 00 call ?fib1@@YAKK@Z ; fib1
00048 83 c4 04 add esp, 4
0004b 03 c6 add eax, esi
$LN3@fib1:
; 29 : }
; 30 : }
0004d 5f pop edi
0004e 5e pop esi
0004f 5b pop ebx
00050 81 c4 c0 00 00
00 add esp, 192 ; 000000c0H
00056 3b ec cmp ebp, esp
00058 e8 00 00 00 00 call __RTC_CheckEsp
0005d 8b e5 mov esp, ebp
0005f 5d pop ebp
00060 c3 ret 0
?fib1@@YAKK@Z ENDP ; fib1
_TEXT ENDS
可以很清楚的看到 Release 版 fib1() 已經最佳化了,
沒有所謂 register 不夠用的問題。
618,553,507 / 495 = 1,249,603.04444444
所以才會造成 Release 版比 Debug 版快了 124萬倍。(誇張的數字~~~)
不過有點離題了。
總之,recursive 和 non-recursive 理論上是差不了多少,
因為 recursive 如果每次 function call 都需要重新 initial 變數,
以及本身 call function 也是需要時間。
而 non-recursive 則是需要時間來 allocate 記憶體來當作 stack 使用。
也可以看出來 std::stack 是很好用沒錯,
不過他本身的 performance 還是比不上 純C 直接 inline 進去。
std::stack 的performance 也輸掉我自己 implement c++ stack,
這倒是讓我有點意料之外。
原本以為應該差不了多少,
看了一下 std::stack 的 source codes ,
才發現它是用 dqueue 來實作,
這也難怪會這樣了~~~
#include "Windows.h"
#include "stack"
static unsigned __int64 rdtsc(void) {
__asm cpuid;
__asm rdtsc;
}
// recursive implementation
unsigned long fib1(volatile unsigned long n)
{
if (n <= 1)
{
return n;
}
else
{
return fib1(n-1)+fib1(n-2);
}
}
// non-recursive std::stack implementation
unsigned long fib2(unsigned long n)
{
std::stack
unsigned long result = 0;
stack.push(n);
while (false == stack.empty())
{
unsigned long m = stack.top();
stack.pop();
if (m <= 1) {
result += m;
} else {
stack.push(m-1);
stack.push(m-2);
}
}
return result;
}
// non-recursive C-style implementation
unsigned long fib3(unsigned long n)
{
#define STACK_SIZE 128
unsigned long *pStack = NULL;
unsigned long result = 0;
int nStack=0;
pStack = (unsigned long *)malloc(STACK_SIZE*sizeof(unsigned long));
#define STACK_PUSH(n) {pStack[nStack]=n;nStack++;if (nStack>STACK_SIZE) {result = -1;goto END;}}
#define STACK_EMPTY() (nStack<=0)
#define STACK_TOP() (pStack[nStack-1])
#define STACK_POP() {nStack--;}
STACK_PUSH(n);
while (false == STACK_EMPTY())
{
unsigned long m = STACK_TOP();
STACK_POP();
if (m <= 1) {
result += m;
} else {
STACK_PUSH(m-1);
STACK_PUSH(m-2);
}
}
END:
if (pStack) {
free(pStack);
}
return result;
}
// non-recursive C++-Style implementation
class stack_int
{
public:
stack_int(int size);
~stack_int();
int push(unsigned long n);
unsigned long pop();
unsigned long top();
bool empty();
private:
unsigned long *pStack;
int nStack;
int nStackSize;
};
stack_int::stack_int(int size)
{
pStack = (unsigned long*) malloc(size*sizeof(unsigned long));
nStack = 0;
nStackSize = size;
}
stack_int::~stack_int()
{
if (pStack) free(pStack);
}
inline int stack_int::push(unsigned long n)
{
if (pStack == NULL) {
return -1;
}
if (nStack>=nStackSize ) {
return -1;
}
pStack[nStack]=n;
nStack++;
}
inline unsigned long stack_int::top()
{
if (pStack == NULL) return 0;
if (nStack <= 0) return 0;
return pStack[nStack-1];
}
inline unsigned long stack_int::pop()
{
nStack--;
return 0;
}
inline bool stack_int::empty()
{
if (nStack>0) return false;
return true;
}
unsigned long fib4(unsigned long n)
{
stack_int *pStackInt = new stack_int(512);
unsigned long result = 0;
pStackInt->push(n);
while (false == pStackInt->empty())
{
unsigned long m = pStackInt->top();
pStackInt->pop();
if (m <= 1) {
result += m;
} else {
pStackInt->push(m-1);
pStackInt->push(m-2);
}
}
delete (pStackInt);
return result;
}
int main(int argc, char* argv[])
{
unsigned int depth = 30;
int i;
int result;
__int64 nStart;
__int64 nEnd;
HANDLE hThread =GetCurrentThread();
if (SetThreadAffinityMask(hThread ,1)==0) {
printf("SetThreadAffinityMask Errorn");
}
for (i=0;i<10;i++) {
{
nStart = rdtsc();
result = fib1(depth);
nEnd = rdtsc();
printf(" recursive: fib1(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
}
if (1) {
nStart = rdtsc();
result = fib2(depth);
nEnd = rdtsc();
printf("non-recursive: fib2(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
}
{
nStart = rdtsc();
result = fib3(depth);
nEnd = rdtsc();
printf("non-recursive: fib3(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
}
{
nStart = rdtsc();
result = fib4(depth);
nEnd = rdtsc();
printf("non-recursive: fib4(%u) = %u, time = %12I64d\n", depth, result, nEnd-nStart);
}
}
getchar();
return 0;
}