-
Notifications
You must be signed in to change notification settings - Fork 0
/
high-load-arm.S
140 lines (115 loc) · 3.29 KB
/
high-load-arm.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/* This file is largely inspired by cpuburn-a7 from
* https://github.com/ssvb/cpuburn-arm
* Copyright © 2013 Siarhei Siamashka <siarhei.siamashka@gmail.com>
*
* Additions for Raspberry Pi by
* Nard Linux SDK
* http://www.arbetsmyra.dyndns.org/nard
* Copyright (C) 2014-2017 Ronny Nilsson
*/
.syntax divided
.section .text
.arm
@-------------------------------------------------------------
@ Power consumer for ARM32 with Neon
/* burn_cpu_neon
 *
 * Spins in a tight loop that mixes unaligned word loads from
 * the literal area in .text (pLabels) and from the words
 * around do_exit with Neon integer operations. The loop is
 * left once the unaligned word read at do_exit - 3 is
 * non-zero.
 *
 * In:    no argument register is read.
 * Out:   r0 = 0 after the loop has been left.
 *        r0 = 1, returned immediately, when neither
 *        __ARM_NEON__ nor __ARM_NEON is defined at build time.
 * Clobb: r1-r3, q0-q3, flags.
 *        r4, r5, fp and q4-q5 are saved and restored here.
 */
.align 2
.func burn_cpu_neon
.type burn_cpu_neon, %function
.global burn_cpu_neon
burn_cpu_neon:
push {r4, r5, fp, lr} @ Prologue
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
/* Four words were pushed, so sp + 12 is the slot holding lr. */
add fp, sp, #12
/* q4 and q5 are loaded with constants below; keep the
 * original contents on the stack until the vpop. */
vpush {q4-q5}
@ Create a pointer to code ram
adr r1, pLabels
pld [r1]
/* r1 = pLabels + 1, so word loads through r1 are unaligned. */
add r1, r1, #1
/* r2 is the scaled index of the code ram load in the loop. */
mov r2, #0
/* Create a pointer to data ram, which also
* happens to be the same location as our C
* code global "break out of loop" flag. */
ldr r5, pExit
/* r5 = do_exit - 4, so that [r5, #1] spans the bytes
 * do_exit - 3 .. do_exit. */
sub r5, r5, #4
pld [r5]
@ Static Neon data for high workload
vmov.u32 q1, #0
vmov.u32 q2, #0xffffffff
vmov.u32 q4, #0xf0f0f0f0
vmov.u32 q5, #0x0f0f0f0f
/* Tight loop where we alternate reading
* unaligned data from both code and data
* ram, combined with Neon calculations. */
/* Jump over the padding that aligns the loop head to 128 bytes. */
b 1f
.align 7
1: ldr r3, [r5, #1] @ Poll do_exit, time to exit loop?
vabd.u32 q0, q1, q2
/* r2 is 0 every time this load executes (initial value, or
 * the zero copied by the movs of the previous pass), so the
 * address is r1 and the writeback leaves r1 unchanged. */
ldr r0, [r1, r2, lsl #2]!
/* q3 is an accumulator that is never initialised or read
 * in this function; only the work matters. */
vaba.u32 q3, q4, q5
/* Copy the polled word and set Z when it is zero. */
movs r2, r3
beq 1b
/* NOTE(review): Z is clear whenever the beq falls through and
 * nothing in between changes the flags, so the moveq below
 * looks unreachable and r0 is always 0 here -- confirm. */
movne r0, #0 @ EXIT_SUCCESS
moveq r0, #1 @ EXIT_FAILURE
vpop {q4-q5}
#else
mov r0, #1 @ EXIT_FAILURE
#endif
pop {r4, r5, fp, pc} @ Epilogue
.endfunc
@-------------------------------------------------------------
@ Power consumer for ARM32
/* burn_cpu_arm
 *
 * Spins in a tight loop of unaligned word loads, alternating
 * between the literal area in .text (pLabels) and the words
 * around do_exit, with register moves in between. The loop is
 * left after the unaligned word read at do_exit - 3 has been
 * seen non-zero.
 *
 * In:    no argument register is read.
 * Out:   r0 = 0 after the loop has been left.
 * Clobb: r1-r3, flags.
 *        r4-r7 and fp are saved and restored here.
 */
        .align  2
        .func   burn_cpu_arm
        .type   burn_cpu_arm, %function
        .global burn_cpu_arm
burn_cpu_arm:
        push    {r4, r5, r6, r7, fp, lr}    @ Prologue
        /* Six words were pushed, so the saved lr sits at sp + 20.
         * fp must point at that slot, which is the same frame
         * layout burn_cpu_neon sets up with its four word push.
         * The former offset of 12 made fp point at the saved r7. */
        add     fp, sp, #20

        @ Create a pointer to code ram
        adr     r1, pLabels
        pld     [r1]

        /* Create a pointer to data ram, which also
         * happens to be the same location as our C
         * code global "break out of loop" flag.
         * r5 = do_exit - 4, so that [r5, #1] spans the bytes
         * do_exit - 3 .. do_exit. */
        ldr     r5, pExit
        sub     r5, r5, #4
        pld     [r5]

        /* Tight low latency optimized loop where
         * we alternate reading unaligned data from
         * both code and data ram. Two instructions
         * per cycle by Cortex-A7 and one by ARM11.
         * Code alignment has impact. */
        mov     r3, #0                      @ First pass must see "keep looping"
        b       1f                          @ Skip the alignment padding
        .align  7
        /* The movs tests the word polled into r3 by the previous
         * pass, so the loop ends one pass after the flag was read
         * as non-zero. The two plain mov instructions and the loads
         * leave the flags alone, which keeps Z valid for the beq.
         * The values loaded into r0, r6 and r7 and the copies in
         * r2 and r4 are never used; only the work matters. */
1:      ldr     r0, [r1, #1]
        movs    r2, r3
        ldr     r3, [r5, #1]                @ Poll do_exit, time to exit loop?
        mov     r4, r1
        ldr     r6, [r1, #1]
        mov     r2, r1
        ldr     r7, [r5, #1]                @ Poll do_exit, time to exit loop?
        beq     1b

        /* NOTE(review): Z is clear whenever the beq falls through,
         * so the moveq looks unreachable and r0 is always 0 here.
         * Kept as is to leave the instruction stream untouched. */
        movne   r0, #0                      @ EXIT_SUCCESS
        moveq   r0, #1                      @ EXIT_FAILURE
        pop     {r4, r5, r6, r7, fp, pc}    @ Epilogue
        .endfunc
@-------------------------------------------------------------
/* Literal area kept in .text.
 *
 * pLabels: dummy word that burn_cpu_neon and burn_cpu_arm
 *          read (unaligned) as "code ram" data.
 * pExit:   address of the C global do_exit, fetched with a
 *          pc relative ldr by both functions.
 *
 * The area starts on a 128 byte boundary and is followed by
 * one more dummy word on the next 32 byte boundary. */
        .p2align 7
pLabels:
        .word   0
pExit:
        .word   do_exit
        .p2align 5
        .word   0
@-------------------------------------------------------------
        .section .bss
        .p2align 7
        /* Storage for the global flag do_exit.
         *
         * The symbol is announced with a size of one byte (as in
         * the C source), but eight bytes are reserved: one guard
         * word in front of the flag and a full word at the flag
         * itself. The burn loops read a whole word starting at
         * do_exit - 3, which therefore stays inside this block. */
        .global do_exit
        .type   do_exit, %object
        .size   do_exit, 1
        .space  4                           /* Guard word before the flag */
do_exit:
        .space  4                           /* The flag, padded to a word */