The Program

/* Walk i over 2..9 and print only the values greater than 4. */
int main(void)
{
    int i;

    for (i = 2; i < 10; i++) {
        if (i > 4) {
            printf("%d ", i);
        }
    }
}

Assembler dump - gcc -O0

Things to note:

  • We store the start of our stack in rbp
  • Then we store our loop variable in this stack space (at rbp - 4)
    • in em64t mode, shorts are 2, ints are 4 bytes, and longs are 8
    • in ia32 mode, shorts are 2, ints are 4 bytes, and longs are 4
  • We shift the stack pointer by 16 bytes. We are not actually using 16 bytes of space - we only need 4 bytes for i. But it seems the compiler reserves stack space in 16-byte chunks. It also seems to try to align arrays: if you add int a[3], then even though the array (12 bytes) plus i (4 bytes) would pack into 16 bytes, the reservation goes to 32 bytes. But it still reserves only 16 bytes for int a[2].
  • If you go back and make the array index a long, the operations go from ___l to ___q (quad word).
  • Seems strange to increment the counter in 2 steps instead of one ( incl -4(%rbp) )
    • ...and the impact is mysterious (see below)

main:
.LFB2:
    # main() as emitted by gcc -O0 for the C program above (AT&T syntax: op src, dst).
    # .LFB2, .LCFI0 and .LCFI1 are the labels the .eh_frame unwind data in the
    # full listing further down refers to.
    #
    # push quadword 'rbp' on to the stack (memory @ rsp), saving the caller's
    # frame pointer.
    # note: a bare 'push' lets the assembler pick the appropriate operand size,
    #  but 'pushb' is rejected with "Error: suffix or operands invalid for `push'"
        pushq   %rbp   

.LCFI0:
    # rbp = rsp: rbp now marks the start of this function's stack frame
        movq    %rsp, %rbp
.LCFI1:
    # reserve a 16-byte block for locals; only the 4 bytes at rbp-4 are used
        subq    $16, %rsp
.LCFI2:

    # initialize our counter (i = 2); i lives in memory at rbp-4
        movl    $2, -4(%rbp)
.L2:
    # loop test: 'i < 10' is compiled as "leave the loop if i > 9" (signed compare)
        cmpl    $9, -4(%rbp)
        jg      .L3
    # 'if (i > 4)' is compiled as "skip the printf if i <= 4"
        cmpl    $4, -4(%rbp)
        jle     .L4
    # printf(.LC0, i): esi = i, edi = address of the format string .LC0
    # (the "%d " string in the full listing), eax = 0 because no vector
    # registers carry arguments to this variadic call
        movl    -4(%rbp), %esi
        movl    $.LC0, %edi
        movl    $0, %eax
        call    printf
.L4:
    # increment the counter (i++) in two steps:
    # leaq computes the address rbp-4 (it does not read memory) and puts it in rax,
    # then incl increments the 32-bit value in memory that rax points to
        leaq    -4(%rbp), %rax
        incl    (%rax)

    # loop back to the test
        jmp     .L2
.L3:
    # leave undoes the prologue (rsp = rbp, then pop rbp). Nothing is loaded into
    # eax before ret, matching the C source, which has no return statement.
        leave
        ret

cycle count tests - simple loop

I've tried different ways to increment the counter, but they all come out the same with this stripped down program.

        # Stripped-down counting loop (fragment: the exit label .L3 is not shown here).
        # i = 0; the counter is the 32-bit value in memory at rbp-4
        movl    $0, -4(%rbp)
.L2:
        # signed compare: exit once i > 10000, so with this bound the body runs
        # for i = 0..10000, i.e. bound + 1 times
        cmpl    $10000, -4(%rbp) # (vary the $10000 for iteration count)
        jg      .L3
        # i++ done in two steps: rax = address of i, then increment through rax
        leaq    -4(%rbp), %rax
        incl    (%rax)
        jmp     .L2

iterations cycles
1 180
10 234
100 820
1,000 6200
10,000 60,200
100,000 600,900
1,000,000 6,045,000
10,000,000 60,400,000

cycle count tests - more complex loop

        # Timed variant of the loop (fragment: the code at .L3 that reads the
        # end timestamp is not shown here).
        # cpuid is used as a barrier before rdtsc; rdtsc returns the time-stamp
        # counter in edx:eax, and only rax is stored as the start value.
        # NOTE(review): cpuid also overwrites ebx/ecx/edx and reads eax as its
        # leaf number - confirm rbx is saved and eax is set in the real program.
        # NOTE(review): the 8-byte store to -8(%rbp) covers rbp-8..rbp-1, which
        # overlaps the 4-byte counter at -4(%rbp) written just below.
        cpuid
        rdtsc
        movq    %rax, -8(%rbp)

        # i = 0
        movl    $0, -4(%rbp)
.L2:
        # exit once i > 100 (signed compare)
        cmpl    $100, -4(%rbp)
        jg      .L3
        # skip the printf argument setup while i <= 4
        cmpl    $4, -4(%rbp)
        jle     .L4
        # argument setup for printf(.LC0, i) is kept, but the call itself is
        # commented out - presumably so that no I/O is timed
        movl    -4(%rbp), %esi
        movl    $.LC0, %edi
        movl    $0, %eax
#       call    printf
.L4:

        # The trailing '---------++' and '---------**' tags are annotation
        # markers for the results table, not assembler syntax; strip them
        # before assembling. '++' = two-step increment, '**' = one-step increment.
        leaq    -4(%rbp), %rax     ---------++
        incl    (%rax)             ---------++

#       incl    -4(%rbp)           ---------**
        jmp     .L2

For small trip counts, it is strangely better to do the ** version than the ++ method! Why would having the extra code matter?

iterations ++ method cycles ** method cycles
10 1206 920
100 2061 1692
1000 9100 8850
10,000 81,000 80,900
100,000 801,135 800,900

Notes on -O3

  • gcc and icc do xorl %eax, %eax to set the register to 0
  • gcc and icc stop using the stack and use registers instead for the loop variable

My hacked asm program

Showing how to hack in rdtsc
        .file   "frame.c"
        .section        .rodata
.LC0:
        .string "%d "
.LC1:
        .string "%lu "
        .text
#-----------------------------------------------------------------------
# int main(void) -- time the counting loop with rdtsc and print the
# elapsed time-stamp-counter ticks as an unsigned 64-bit number.
#
# Syntax: AT&T (GAS), x86-64.   ABI: System V AMD64.
# Stack frame (rbp-relative, three non-overlapping slots):
#   i          at  -4(%rbp)   4 bytes   loop counter
#   t_start    at -16(%rbp)   8 bytes   full 64-bit TSC value before the loop
#   saved rbx  at -24(%rbp)   8 bytes   cpuid overwrites ebx (callee-saved)
# Out:      eax is whatever printf returned (no explicit return value is set,
#           as in the original C source).
# Clobbers: rax, rcx, rdx, rsi, rdi, flags, plus anything printf clobbers.
# Note:     .LC0/.LC1 are loaded as absolute 32-bit addresses, so link as a
#           non-PIE executable on toolchains that default to PIE.
# Note:     the hand-copied .eh_frame data that follows only describes the
#           rbp save, not the rbx save done here.
#-----------------------------------------------------------------------
.globl main
        .type   main, @function
main:
.LFB2:
        pushq   %rbp
.LCFI0:
        movq    %rsp, %rbp
.LCFI1:
        subq    $32, %rsp               # room for all three slots; keeps rsp 16-byte aligned for the call
.LCFI2:
        movq    %rbx, -24(%rbp)         # cpuid below destroys rbx; the ABI requires main to preserve it

        # ---- start timestamp -------------------------------------------
        xorl    %eax, %eax              # select cpuid leaf 0 explicitly instead of using stale eax
        cpuid                           # barrier: earlier instructions finish before rdtsc
        rdtsc                           # edx:eax = time-stamp counter
        shlq    $32, %rdx
        orq     %rdx, %rax              # rax = full 64-bit counter value
        movq    %rax, -16(%rbp)         # t_start (does not overlap i at -4(%rbp))

        # ---- loop under test -------------------------------------------
        movl    $0, -4(%rbp)            # i = 0
.L2:
        cmpl    $100000, -4(%rbp)       # exit once i > 100000 (body runs for i = 0..100000)
        jg      .L3
        cmpl    $4, -4(%rbp)
        jle     .L4
        # printf argument setup is kept so the loop body matches the compiler
        # output; the call itself stays disabled so no I/O is timed.
        movl    -4(%rbp), %esi
        movl    $.LC0, %edi
        movl    $0, %eax
#       call    printf
.L4:

        # Alternative ways to increment i that were tried; enable one at a time.
#        movq    %rbp, %rax
#        addq    $-4, %rax

#       leaq    -4(%rbp), %rax
#       incl    (%rax)

        incl    -4(%rbp)
        jmp     .L2
.L3:
        # ---- end timestamp and elapsed ticks ---------------------------
        xorl    %eax, %eax              # cpuid leaf 0 again, same as at the start
        cpuid
        rdtsc
        shlq    $32, %rdx
        orq     %rdx, %rax              # rax = full 64-bit end counter value
        subq    -16(%rbp), %rax         # rax = t_end - t_start

        # printf("%lu ", elapsed)
        movq    %rax, %rsi
        movl    $.LC1, %edi
        movl    $0, %eax                # no vector registers used by this variadic call
        call    printf

        movq    -24(%rbp), %rbx         # restore callee-saved rbx
        leave
        ret
.LFE2:
        .size   main, .-main
        # Hand-encoded DWARF call-frame information for main, copied from the
        # compiler output: one CIE (common rules) followed by one FDE (main).
        .section        .eh_frame,"a",@progbits
.Lframe1:
        .long   .LECIE1-.LSCIE1         # CIE length
.LSCIE1:
        .long   0x0                     # CIE id (0 marks a CIE in .eh_frame)
        .byte   0x1                     # CIE version
        .string ""                      # augmentation string: none
        .uleb128 0x1                    # code alignment factor
        .sleb128 -8                     # data alignment factor
        .byte   0x10                    # return-address column: DWARF register 16
        .byte   0xc                     # DW_CFA_def_cfa
        .uleb128 0x7                    #   register 7 (rsp)
        .uleb128 0x8                    #   offset 8: CFA = rsp + 8 on entry
        .byte   0x90                    # DW_CFA_offset for register 16 (return address)
        .uleb128 0x1                    #   factored offset 1: saved at CFA - 8
        .align 8
.LECIE1:
.LSFDE1:
        .long   .LEFDE1-.LASFDE1        # FDE length
.LASFDE1:
        .long   .LASFDE1-.Lframe1       # offset back to the CIE above
        .quad   .LFB2                   # start address of main
        .quad   .LFE2-.LFB2             # size of main in bytes
        .byte   0x4                     # DW_CFA_advance_loc4
        .long   .LCFI0-.LFB2            #   to just after 'pushq %rbp'
        .byte   0xe                     # DW_CFA_def_cfa_offset
        .uleb128 0x10                   #   CFA = rsp + 16
        .byte   0x86                    # DW_CFA_offset for register 6 (rbp)
        .uleb128 0x2                    #   factored offset 2: saved at CFA - 16
        .byte   0x4                     # DW_CFA_advance_loc4
        .long   .LCFI1-.LCFI0           #   to just after 'movq %rsp, %rbp'
        .byte   0xd                     # DW_CFA_def_cfa_register
        .uleb128 0x6                    #   CFA is now computed from rbp (register 6)
        .align 8
.LEFDE1:
        # Empty .note.GNU-stack section: tells the linker this object does not
        # need an executable stack.
        .section        .note.GNU-stack,"",@progbits
        # Records the compiler that produced the original listing.
        .ident  "GCC: (GNU) 3.4.3 20041212 (Red Hat 3.4.3-9.EL4)"

-- MattWalsh - 20 Dec 2005

Topic revision: r1 - 20 Dec 2005 - MattWalsh
 
This site is powered by the TWiki collaboration platform. Copyright © 2008-2012 by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding TWiki? Send feedback