IA-64 : System Calls
I (=Arun) was examining the consequences of using ar.k5 as the system call entry point.
Key findings:
- Reading ar.k5 has a latency of 12 cycles, as opposed to 1 cycle for reading a general register (GR).
- Supplying the entry-point address as an immediate value, or loading it from memory, instead of reading ar.k5 saves about 11 cycles per system call.
test.c
<pre> #include <stdio.h>
/* Number of iterations of each timed loop. */
#define COUNT 10000

/* 64-bit unsigned integer used for counter and register values. */
typedef unsigned long uint64_t;
/*
 * Read the IA-64 application register ar.itc and return its value.
 * The asm is volatile so the compiler does not drop or merge reads.
 */
static inline uint64_t ia64_get_itc() {
    uint64_t res;

    asm volatile ("mov %0=ar.itc" : "=r" (res));
    return res;
}
/*
 * Read the IA-64 application register ar.k5 and return its value.
 * The asm is volatile so the read is kept even when the result is unused,
 * which is how the timing loop in main() calls it.
 */
static inline uint64_t ia64_get_k5() {
    uint64_t res;

    asm volatile ("mov %0=ar.k5" : "=r" (res));
    return res;
}
/*
 * Read the IA-64 general register r1 and return its value.
 * Used as the baseline against which the ar.k5 read is compared.
 */
static inline uint64_t ia64_get_r1() {
    uint64_t res;

    asm volatile ("mov %0=r1" : "=r" (res));
    return res;
}
main() {
- uint64_t t1, t2, i; t1 = ia64_get_itc();
for (i = 0; i < COUNT; i++)
- ia64_get_k5();
for (i = 0; i < COUNT; i++)
- ia64_get_r1();
for (i = 0; i < COUNT; i++)
- getpid1();
for (i = 0; i < COUNT; i++)
- getpid2();
for (i = 0; i < COUNT; i++)
- getpid3();
} </pre>
t.S
<pre> #include <machine/asm.h>
/*
 * 64-bit word holding the address 0xa000000000000000.  getpid3 loads this
 * word through a gp-relative address and branches to the loaded value.
 */
entry: data8 0xa000000000000000
.text
/*
 * Error path taken by the getpid stubs when r10 is non-zero after the call.
 * Copies the current instruction pointer into b0 and branches to it.
 * NOTE(review): this appears to branch back to the bundle that read ip,
 * i.e. spin in place -- confirm that is the intent.
 */
_error:
        mov r14=ip ;;
        mov b0=r14
        br.cond.sptk.few b0
/*
 * getpid1: call through the address read from ar.k5.
 *
 * r10 receives the caller's b0 and r8 is set to 20 (presumably the getpid
 * system call number -- confirm) before branching to the entry point via b7.
 * After the call, a non-zero r10 sets p6 and control goes to _error;
 * otherwise the stub returns through b0.
 */
ENTRY(getpid1, 8)
        alloc r9=ar.pfs,8,0,0,0
        mov.m r31=ar.k5
        mov r10=b0;;
        mov r8=20
        mov b7=r31
        br.call.sptk.few b0=b7;;
        cmp.eq p0,p6=r0,r10
(p6)    br.cond.sptk.few _error;
        br.ret.sptk.few b0;;
.endp
/*
 * getpid2: call through an entry-point address supplied as an immediate
 * (movl of 0xa000000000000000) instead of reading ar.k5.
 *
 * r10 receives the caller's b0 and r8 is set to 20 (presumably the getpid
 * system call number -- confirm) before branching to the entry point via b7.
 * After the call, a non-zero r10 sets p6 and control goes to _error;
 * otherwise the stub returns through b0.
 */
ENTRY(getpid2, 8)
        alloc r9=ar.pfs,8,0,0,0
        movl r31=0xa000000000000000
        mov r10=b0;;
        mov r8=20
        mov b7=r31
        br.call.sptk.few b0=b7;;
        cmp.eq p0,p6=r0,r10
(p6)    br.cond.sptk.few _error
        br.ret.sptk.few b0;;
.endp
/*
 * getpid3: call through an entry-point address loaded from memory (the
 * data8 word at label "entry", addressed relative to gp) instead of
 * reading ar.k5.
 *
 * r10 receives the caller's b0 and r8 is set to 20 (presumably the getpid
 * system call number -- confirm) before branching to the entry point via b7.
 * After the call, a non-zero r10 sets p6 and control goes to _error;
 * otherwise the stub returns through b0.
 */
ENTRY(getpid3, 8)
        alloc r9=ar.pfs,8,0,0,0
        addl r14 = @gprel(entry#), gp;;
        ld8 r31 = [r14]
        mov r10=b0;;
        mov r8=20
        mov b7=r31
        br.call.sptk.few b0=b7;;
        cmp.eq p0,p6=r0,r10
(p6)    br.cond.sptk.few _error
        br.ret.sptk.few b0;;
.endp </pre>
Results
<pre>
$ gcc -O2 test.c t.S -o t
$ ./t
12
1
433
422
422
</pre>