1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
| | /* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 Regents of the University of California
*/
#define SZREG 8
#define REG_S sd
.global memset
.type memset,@function
memset:
move t0, a0 /* Preserve return value */
/* Defer to byte-oriented fill for small sizes */
sltiu a3, a2, 16
bnez a3, 4f
/*
* Round to nearest XLEN-aligned address
* greater than or equal to start address
*/
addi a3, t0, SZREG-1
andi a3, a3, ~(SZREG-1)
beq a3, t0, 2f /* Skip if already aligned */
/* Handle initial misalignment */
sub a4, a3, t0
1:
sb a1, 0(t0)
addi t0, t0, 1
bltu t0, a3, 1b
sub a2, a2, a4 /* Update count */
2:
andi a1, a1, 0xff
slli a3, a1, 8
or a1, a3, a1
slli a3, a1, 16
or a1, a3, a1
slli a3, a1, 32
or a1, a3, a1
/* Calculate end address */
andi a4, a2, ~(SZREG-1)
add a3, t0, a4
andi a4, a4, 31*SZREG /* Calculate remainder */
beqz a4, 3f /* Shortcut if no remainder */
neg a4, a4
addi a4, a4, 32*SZREG /* Calculate initial offset */
/* Adjust start address with offset */
sub t0, t0, a4
/* Jump into loop body */
/* Assumes 64-bit instruction lengths */
la a5, 3f
srli a4, a4, 1
add a5, a5, a4
jr a5
3:
REG_S a1, 0(t0)
REG_S a1, SZREG(t0)
REG_S a1, 2*SZREG(t0)
REG_S a1, 3*SZREG(t0)
REG_S a1, 4*SZREG(t0)
REG_S a1, 5*SZREG(t0)
REG_S a1, 6*SZREG(t0)
REG_S a1, 7*SZREG(t0)
REG_S a1, 8*SZREG(t0)
REG_S a1, 9*SZREG(t0)
REG_S a1, 10*SZREG(t0)
REG_S a1, 11*SZREG(t0)
REG_S a1, 12*SZREG(t0)
REG_S a1, 13*SZREG(t0)
REG_S a1, 14*SZREG(t0)
REG_S a1, 15*SZREG(t0)
REG_S a1, 16*SZREG(t0)
REG_S a1, 17*SZREG(t0)
REG_S a1, 18*SZREG(t0)
REG_S a1, 19*SZREG(t0)
REG_S a1, 20*SZREG(t0)
REG_S a1, 21*SZREG(t0)
REG_S a1, 22*SZREG(t0)
REG_S a1, 23*SZREG(t0)
REG_S a1, 24*SZREG(t0)
REG_S a1, 25*SZREG(t0)
REG_S a1, 26*SZREG(t0)
REG_S a1, 27*SZREG(t0)
REG_S a1, 28*SZREG(t0)
REG_S a1, 29*SZREG(t0)
REG_S a1, 30*SZREG(t0)
REG_S a1, 31*SZREG(t0)
addi t0, t0, 32*SZREG
bltu t0, a3, 3b
andi a2, a2, SZREG-1 /* Update count */
4:
/* Handle trailing misalignment */
beqz a2, 6f
add a3, t0, a2
5:
/* Fill head and tail with minimal branching. Each
* conditional ensures that all the subsequently used
* offsets are well-defined and in the dest region. */
sb a1, 0(t0)
sb a1, -1(a3)
li a4, 2
bgeu a4, a2, 6f
sb a1, 1(t0)
sb a1, 2(t0)
sb a1, -2(a3)
sb a1, -3(a3)
li a4, 6
bgeu a4, a2, 6f
sb a1, 3(t0)
sb a1, -4(a3)
li a4, 8
bgeu a4, a2, 6f
sb a1, 4(t0)
sb a1, 5(t0)
sb a1, -5(a3)
li a4, 11
bgeu a4, a2, 6f
sb a1, 6(t0)
sb a1, -6(a3)
sb a1, -7(a3)
li a4, 14
bgeu a4, a2, 6f
sb a1, 7(t0)
6:
ret
|