see also:
- This is a much more comprehensive comparison program and article, but without hard results
- This contains some info about the V-table.
- This detailed article on the matter
Here's a
very simple program to try and measure the cost of using virtual functions.
The
Rectangle and
Square classes derive from
Shape. The
area() function call is made polymorphically.
The
Rectangle_plain class does a poor-man's polymorphism. That is, it holds on to a flag to tell it whether it is a square or rectangle and does the right thing for you. This is probably not totally fair, since the fake polymorphism just adds a compare. Still, this shows a sort of worst case comparison.
Here are the results on an Intel Nocona Xeon 3.4 in 32-bit mode. You'll want to compile it like this:
icc -DDO_PLAIN_FUNCS -O3 vfunc.cpp
| Compiler setup |
virtual us/call |
non-virtual us/call |
plain function us/call |
non-virtual speedup |
plain function speedup |
icc -O0 |
.022 |
.018 |
.0062 |
1.2x |
3.5x |
icc -O2 |
.0184 |
.0135 |
.005 |
1.36x |
31x |
icc -O3 |
.0183 |
.0136 |
.0005 |
1.34x |
37x |
icc -O3 -prof_use |
.0184 |
.0138 |
.0006 |
1.33x |
31x |
icc -O3 -ip |
.0184 |
.0136 |
.0005 |
1.35x |
36.8x |
icc -O3 -ip -prof_use |
.0153 |
.0138 |
.0006 |
1.1x |
25.5x |
icc -O3 -ipo |
.0157 |
.0136 |
.0005 |
1.15x |
31x |
icc -O3 -ipo -prof_use |
.0136 |
.0137 |
.0005 |
1x |
27x |
Although this is a very small, contrived test...
- I have been told that taking off the plain function test helps performance. I haven't been able to get back to the machine where I did the testing. But to try it, simply compile without the
-DDO_PLAIN_FUNCS.
- There is consistently about 20-36% more overhead for a virtual call
- Don't ask me why, but somehow
-ipo helped a single file program (?!?). The compiler spits out... IPO: performing single-file optimizations. I guess IPO can help a single file as well.
-
PGO helped virtual function calls significantly!
-
-O3 -ip is no better than just -O3. But -O3 -ip -prof_use is measurably better than either -O3 -ip or -O3. The profile data is helping the 'ip' work. This is a documented feature (I think the compiler makes inlining decisions based on PGO), but it seems strange that PGO would be needed on such a simple program.
- Non-intuitively, on a single file,
-O3 -ipo is superior to -O3 -ip.
- Ditto for
-O3 -ipo -prof_use vs. -O3 -ip -prof_use.
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>
using namespace std;
// Number of objects allocated in each pointer array, and therefore the number
// of calls made per pass over an array
#define ITERS 1000000
// Number of passes made over each array inside every timed section, so each
// timed section performs ITERS * REPS calls in total
#define REPS 100
/*
 * Non-virtual stand-in for the Shape hierarchy ("poor-man's polymorphism").
 * One class represents both squares and rectangles: the constructor records
 * which kind was requested in is_square, and area() branches on that flag
 * instead of going through a virtual call.
 */
class Rectangle_plain
{
private:
    int height;
    int width;
    int is_square;  // non-zero when the object was constructed as a square
public:
    // Builds a square with side h_in. width is set to the side length too, so
    // that no member is left uninitialized.
    Rectangle_plain(int h_in) : height(h_in), width(h_in), is_square(1) {}

    // Builds a rectangle of height h_in and width w_in.
    Rectangle_plain(int h_in, int w_in) : height(h_in), width(w_in), is_square(0) {}

    // Returns the area. The branch on is_square is kept on purpose: it is the
    // hand-rolled dispatch whose cost this class exists to measure.
    // Declared const so it can be called on const objects.
    int area() const
    {
        if (is_square)
            return height * height;
        return height * width;
    }

    ~Rectangle_plain() {}
};
/*
 * Abstract base class for the virtual-dispatch side of the benchmark.
 * Derived classes supply area(); the destructor is virtual so objects can be
 * deleted through a Shape pointer.
 */
class Shape
{
public:
    Shape()
    {
    }

    virtual ~Shape()
    {
    }

    // Returns the area of the concrete shape.
    virtual int area() = 0;
};
/*
 * Shape implementation for a rectangle with independent height and width.
 */
class Rectangle : public Shape
{
private:
    int height;
    int width;

public:
    // Stores the given height and width.
    Rectangle(int h_in, int w_in) : Shape(), height(h_in), width(w_in)
    {
    }

    // Overrides Shape::area(): height times width.
    virtual int area()
    {
        return height * width;
    }

    virtual ~Rectangle()
    {
    }
};
/*
 * Shape implementation for a square, described by a single side length.
 */
class Square : public Shape
{
private:
    int height;

public:
    // Stores the side length.
    Square(int h_in) : Shape(), height(h_in)
    {
    }

    // Overrides Shape::area(): side length squared.
    virtual int area()
    {
        return height * height;
    }

    virtual ~Square()
    {
    }
};
// Free-function baseline for the benchmark: returns the product of h and w.
int compute_area(int h, int w)
{
    const int product = h * w;
    return product;
}
int main(void)
{
long tot_area;
clock_t start, end;
Shape** ptrs =
(Shape**)malloc(ITERS*sizeof(Shape**));
Rectangle_plain** ptrs_plain =
(Rectangle_plain**)malloc(ITERS*sizeof(Rectangle_plain**));
/* Make up a random bunch of both virtual function-based and
* non-virtual function-based objects in a random list
*/
for (long i = 0; i < ITERS; i++)
{
int which = (int) (2.0*rand()/(RAND_MAX+1.0));
if (which)
{ ptrs[i] = new Square(which + 1);
ptrs_plain[i] = new Rectangle_plain(which + 1);
}
else
{ ptrs[i] = new Rectangle(which + 1, which + 2);
ptrs_plain[i] = new Rectangle_plain(which + 1, which + 2);
}
}
tot_area = 0;
start = clock();
for (long j = 0; j < REPS; j++)
for (long i = 0; i < ITERS; i++)
{ tot_area += ptrs[i]->area();
}
end=clock();
float calls_per_sec_vfunc = (float)(end-start)*1000000/CLOCKS_PER_SEC/ITERS/REPS;
printf("Time for virtual functions: %f, %f ms/call checksum: %ld\n",
(float)(end-start)/CLOCKS_PER_SEC,
calls_per_sec_vfunc,
tot_area);
tot_area = 0;
start = clock();
for (long j = 0; j < REPS; j++)
for (long i = 0; i < ITERS; i++)
{ tot_area += ptrs_plain[i]->area();
}
end=clock();
float calls_per_sec_class = (float)(end-start)*1000000/CLOCKS_PER_SEC/ITERS/REPS;
printf("Time for ordinary class functions: %f, %f ms/call checksum: %ld\n",
(float)(end-start)/CLOCKS_PER_SEC,
calls_per_sec_class,
tot_area);
#ifdef DO_PLAIN_FUNCS
tot_area = 0;
start = clock();
for (long j = 0; j < REPS; j++)
for (long i = 0; i < ITERS; i++)
{ tot_area += compute_area(1, i+1);
}
end=clock();
float calls_per_sec_func = (float)(end-start)*1000000/CLOCKS_PER_SEC/ITERS/REPS;
printf("Time for ordinary functions: %f, %f ms/call %ld\n",
(float)(end-start)/CLOCKS_PER_SEC,
calls_per_sec_func,
tot_area);
#endif
cout << endl << "Non-virtuals ran " << calls_per_sec_vfunc / calls_per_sec_class << "x faster than virtuals" << endl;
#ifdef DO_PLAIN_FUNCS
cout << "Plain calls ran " << calls_per_sec_vfunc / calls_per_sec_func << "x faster than virtuals" << endl;
#endif
for (long i = 0; i < ITERS; i++)
{ delete ptrs[i];
delete ptrs_plain[i];
}
free(ptrs);
free(ptrs_plain);
}
--
MattWalsh - 24 Nov 2004