The Program
#include <stdio.h>   /* declares printf; calling a variadic function without a prototype is undefined */

/*
 * Loop i over 2..9 and print every value greater than 4, each followed by a
 * space (output: "5 6 7 8 9 ").
 *
 * There is deliberately no return statement, so the gcc -O0 listing that
 * accompanies this program still corresponds to the source line for line.
 */
int main(void)
{
    int i;

    for (i = 2; i < 10; i++) {
        if (i > 4) {
            printf("%d ", i);
        }
    }
}
Assembler dump - gcc -O0
Things to note:
- We save the caller's rbp and then copy the stack pointer (rsp) into
rbp, so rbp marks the base of our stack frame
- Then we store our loop variable in this stack space (at
rbp - 4)
- in em64t mode, shorts are 2, ints are 4 bytes, and longs are 8
- in ia32 mode, shorts are 2, ints are 4 bytes, and longs are 4
- We shift the stack pointer down by 16 bytes. We are not actually using 16 bytes of space - the only local is the 4-byte loop variable. But the compiler reserves stack in 16 byte chunks, because the x86-64 System V ABI requires the stack pointer to be 16-byte aligned at every call. It also seems to try to align arrays. That is, if you add
int a[3], even though the 12-byte array plus the 4-byte loop variable would pack exactly into 16 bytes, it goes to a 32 byte reservation. But it still reserves 16 bytes for int a[2].
- If you go back and make the array index a
long, the operations go from ___l to ___q (quad word).
- Seems strange to increment the counter in 2 steps instead of one (
incl -4(%rbp) )
- ...and the impact is mysterious (see below)
# main as emitted by gcc -O0 (AT&T syntax, x86-64).
# The only local, the loop counter i, lives in memory at -4(%rbp) and is
# re-read from the stack every time it is used.
main:
.LFB2:
# Save the caller's frame pointer: push quadword 'rbp' on to the stack (memory @ rsp).
# note: a bare 'push' lets the assembler pick the operand size from the register,
# but 'pushb' is rejected with "Error: suffix or operands invalid for `push'"
pushq %rbp
.LCFI0:
# rbp = rsp: rbp is now the base of this function's frame
movq %rsp, %rbp
.LCFI1:
# reserve 16 bytes of stack below rbp for locals
subq $16, %rsp
.LCFI2:
# initialize our counter: i = 2
movl $2, -4(%rbp)
.L2:
# loop test: leave the loop once i > 9 (i.e. when 'i < 10' is false)
cmpl $9, -4(%rbp)
jg .L3
# if (i > 4): skip the printf call when i <= 4
cmpl $4, -4(%rbp)
jle .L4
# set up printf(fmt, i): %esi = i, %edi = address of .LC0
# (.LC0 is not defined in this dump; presumably the "%d " literal - confirm)
movl -4(%rbp), %esi
movl $.LC0, %edi
# %eax = 0 before the variadic call (System V: %al = number of vector registers used)
movl $0, %eax
call printf
.L4:
# increment the counter in two steps:
# leaq computes the address rbp - 4 (no memory access) and puts it in rax,
# then incl increments the 32-bit value in memory that rax points to
leaq -4(%rbp), %rax
incl (%rax)
# loop back to the test
jmp .L2
.L3:
# epilogue: 'leave' is equivalent to movq %rbp, %rsp / popq %rbp
# note: nothing on this path writes %eax, so main's return value is not set explicitly
leave
ret
cycle count tests - simple loop
I've tried different ways to increment the counter, but they all come out the same with this stripped down program.
# Stripped-down timing loop (excerpt: the exit label .L3 is not part of this snippet).
# The counter is kept in memory at -4(%rbp) and starts at 0.
movl $0, -4(%rbp)
.L2:
# Exit once the counter is greater than the bound. Because the test is 'jg',
# the body runs for counter values 0 through the bound inclusive (bound + 1 trips).
cmpl $10000, -4(%rbp) # (vary the $10000 for iteration count)
jg .L3
# two-step increment: take the address of the counter, then increment through it
leaq -4(%rbp), %rax
incl (%rax)
jmp .L2
| iterations | cycles |
| 1 | 180 |
| 10 | 234 |
| 100 | 820 |
| 1,000 | 6200 |
| 10,000 | 60,200 |
| 100,000 | 600,900 |
| 1,000,000 | 6,045,000 |
| 10,000,000 | 60,400,000 |
cycle count tests - more complex loop
# Timing loop with the printf argument set-up left in but the call itself commented out
# (excerpt: the exit label .L3 is not part of this snippet).
# The trailing '---------++' / '---------**' markers are annotations for the
# results table, not assembler syntax.
#
# cpuid is used as a serializing instruction ahead of rdtsc; it overwrites
# %eax, %ebx, %ecx and %edx. rdtsc returns the time stamp counter in %edx:%eax.
cpuid
rdtsc
# Store the start time stamp (only the %eax half is in %rax here).
# NOTE(review): this 8-byte store covers rbp-8 .. rbp-1, which overlaps the
# 4-byte counter at -4(%rbp) initialised on the next line.
movq %rax, -8(%rbp)
# counter = 0
movl $0, -4(%rbp)
.L2:
# exit once the counter is greater than 100 (counter values 0..100 run the body)
cmpl $100, -4(%rbp)
jg .L3
# skip the argument set-up while the counter is <= 4
cmpl $4, -4(%rbp)
jle .L4
# argument set-up for printf; the call is disabled, so these are just extra work in the loop
movl -4(%rbp), %esi
movl $.LC0, %edi
movl $0, %eax
# call printf
.L4:
# '++' method: two-step increment through a pointer in %rax
leaq -4(%rbp), %rax ---------++
incl (%rax) ---------++
# '**' method: single read-modify-write increment of the stack slot
# incl -4(%rbp) ---------**
jmp .L2
For small trip counts, it is strangely better to do the
** version than the
++ method! Why would having the extra code matter?
| iterations | ++ method cycles | ** method cycles |
| 10 | 1206 | 920 |
| 100 | 2061 | 1692 |
| 1000 | 9100 | 8850 |
| 10,000 | 81,000 | 80,900 |
| 100,000 | 801,135 | 800,900 |
Notes on -O3
-
gcc and icc do xorl %eax, %eax to set the register to 0
-
gcc and icc stop using the stack and use registers instead for the loop variable
My hacked asm program
Showing how to hack in
rdtsc
#-----------------------------------------------------------------------
# int main(void)  -- frame.c, hand-edited to time the loop with rdtsc
#
# Syntax: AT&T / GAS.   Target: x86-64 ELF, System V AMD64 ABI.
#
# Runs the counting loop (printf argument set-up kept, call disabled),
# measures it with cpuid+rdtsc on both sides, and prints the elapsed
# time-stamp-counter ticks as an unsigned 64-bit decimal.
#
# Stack frame (16 bytes below %rbp, no overlapping slots):
#     -4(%rbp)   4 bytes   loop counter i
#    -16(%rbp)   8 bytes   start time stamp (full 64-bit %rdx:%rax)
#
# Out:   %eax = 0 (exit status)
# Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8, flags.
#        %rbx is callee-saved and is preserved across each cpuid via %r8.
#-----------------------------------------------------------------------
	.file	"frame.c"
	.section	.rodata
.LC0:
	.string	"%d "
.LC1:
	.string	"%lu "
	.text
	.globl	main
	.type	main, @function
main:
.LFB2:
	pushq	%rbp
.LCFI0:
	movq	%rsp, %rbp
.LCFI1:
	subq	$16, %rsp		# rsp stays 16-byte aligned for the call below
.LCFI2:
	# --- start time stamp ------------------------------------------
	movq	%rbx, %r8		# cpuid overwrites ebx; rbx is callee-saved
	xorl	%eax, %eax		# fixed cpuid leaf 0, so its cost is repeatable
	cpuid				# serialize: earlier instructions retire first
	movq	%r8, %rbx
	rdtsc				# edx:eax = TSC (upper halves of rax/rdx cleared)
	shlq	$32, %rdx
	orq	%rdx, %rax		# rax = full 64-bit time stamp
	movq	%rax, -16(%rbp)

	movl	$0, -4(%rbp)		# i = 0
.L2:
	cmpl	$100000, -4(%rbp)	# vary the bound for the iteration count;
	jg	.L3			# 'jg' means the body runs bound + 1 times
	cmpl	$4, -4(%rbp)
	jle	.L4
	# printf argument set-up is kept as loop work; the call stays disabled
	movl	-4(%rbp), %esi
	leaq	.LC0(%rip), %rdi	# RIP-relative, so the file also links as PIE
	movl	$0, %eax
	# call printf
.L4:
	# Increment variants that were measured (enable exactly one):
	#   movq %rbp, %rax ; addq $-4, %rax ; incl (%rax)
	#   leaq -4(%rbp), %rax ; incl (%rax)
	incl	-4(%rbp)
	jmp	.L2

.L3:
	# --- end time stamp and report ---------------------------------
	movq	%rbx, %r8
	xorl	%eax, %eax
	cpuid
	rdtsc
	movq	%r8, %rbx
	shlq	$32, %rdx
	orq	%rdx, %rax
	subq	-16(%rbp), %rax		# rax = end - start

	movq	%rax, %rsi
	leaq	.LC1(%rip), %rdi
	xorl	%eax, %eax		# variadic call: no vector register arguments
	call	printf@PLT

	xorl	%eax, %eax		# return 0 instead of printf's result
	leave
	ret
.LFE2:
	.size	main, .-main
# Call-frame information (DWARF CFI) for main, emitted by the compiler.
# It tells an unwinder how to find the caller's frame at each point in main.
# Register numbers follow the x86-64 DWARF mapping: 6 = rbp, 7 = rsp, 16 = return address.
.section .eh_frame,"a",@progbits
.Lframe1:
# ---- CIE (Common Information Entry): rules shared by every FDE that points here ----
# length of the CIE, not counting this length field
.long .LECIE1-.LSCIE1
.LSCIE1:
# CIE id (0 marks a CIE in .eh_frame)
.long 0x0
# CIE version
.byte 0x1
# augmentation string (empty)
.string ""
# code alignment factor = 1
.uleb128 0x1
# data alignment factor = -8
.sleb128 -8
# return-address column = register 16
.byte 0x10
# DW_CFA_def_cfa: CFA = register 7 (rsp) + 8 on function entry
.byte 0xc
.uleb128 0x7
.uleb128 0x8
# DW_CFA_offset (0x80 | 16): return address saved at CFA + 1 * (-8)
.byte 0x90
.uleb128 0x1
.align 8
.LECIE1:
# ---- FDE (Frame Description Entry) for main: covers .LFB2 up to .LFE2 ----
.LSFDE1:
# length of the FDE, not counting this length field
.long .LEFDE1-.LASFDE1
.LASFDE1:
# offset back to the CIE above
.long .LASFDE1-.Lframe1
# start address and size of the code range described
.quad .LFB2
.quad .LFE2-.LFB2
# DW_CFA_advance_loc4: move to .LCFI0 (just after 'pushq %rbp')
.byte 0x4
.long .LCFI0-.LFB2
# DW_CFA_def_cfa_offset: CFA is now rsp + 16
.byte 0xe
.uleb128 0x10
# DW_CFA_offset (0x80 | 6): caller's rbp saved at CFA + 2 * (-8) = CFA - 16
.byte 0x86
.uleb128 0x2
# DW_CFA_advance_loc4: move to .LCFI1 (just after 'movq %rsp, %rbp')
.byte 0x4
.long .LCFI1-.LCFI0
# DW_CFA_def_cfa_register: CFA is now computed from register 6 (rbp)
.byte 0xd
.uleb128 0x6
.align 8
.LEFDE1:
# empty .note.GNU-stack section: marks this object as not needing an executable stack
.section .note.GNU-stack,"",@progbits
# compiler identification string recorded in the object file
.ident "GCC: (GNU) 3.4.3 20041212 (Red Hat 3.4.3-9.EL4)"
--
MattWalsh - 20 Dec 2005