mailing list of musl libc
src/string/riscv64/memcpy.S (blob ee59924b18850095f8087fa0ab3619292f9f47ef), from
https://inbox.vuxu.org/musl/20230607100710.4286-3-zhang_fei_0403@163.com/

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#define SZREG 8
#define REG_S sd
#define REG_L ld
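
/*
 * Rough C-level sketch of the strategy below (illustrative only):
 *
 *   if (n < 16) copy bytes one at a time;
 *   else {
 *       copy head bytes until dst is SZREG-aligned;
 *       if (src is now SZREG-aligned too)
 *           copy 16 words per iteration, then single words;
 *       else
 *           build each dst word from two shifted source words;
 *       copy any remaining tail bytes;
 *   }
 */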

.global memcpy
.type memcpy,@function
memcpy:
        /* Save for return value */
        mv      t6, a0

        /*
         * Register allocation for code below:
         * a0 - start of uncopied dst
         * a1 - start of uncopied src
         * t0 - end of uncopied dst (one past the last byte)
         * t6 - original dst, restored as the return value
         */
        add     t0, a0, a2

        /*
         * Use bytewise copy if the size is too small.
         *
         * This threshold must be at least 2*SZREG to ensure that at
         * least one wordwise copy is performed. 16 is chosen because
         * it saves at least 7 iterations of bytewise copy, which pays
         * for the fixed overhead.
         */
        li      a3, 16
        bltu    a2, a3, .Lbyte_copy_tail

        /*
         * Bytewise copy first to align a0 to a word boundary.
         */
        addi    a2, a0, SZREG-1
        andi    a2, a2, ~(SZREG-1)
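        /*
         * a2 = (a0 + SZREG-1) & ~(SZREG-1): dst rounded up to the
         * next word boundary. At most SZREG-1 head bytes are copied.
         */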
        beq     a0, a2, 2f
1:
        lb      a5, 0(a1)
        addi    a1, a1, 1
        sb      a5, 0(a0)
        addi    a0, a0, 1
        bne     a0, a2, 1b
2:

        /*
         * Now a0 is word-aligned. If a1 is also word-aligned, we can
         * perform an aligned word-wise copy. Otherwise src and dst
         * are mutually misaligned and each destination word must be
         * assembled from two shifted source words.
         */
        andi    a3, a1, SZREG-1
        bnez    a3, .Lmisaligned_word_copy

        /* Unrolled wordwise copy */
        addi    t0, t0, -(16*SZREG-1)
        bgeu    a0, t0, 2f
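        /*
         * Each iteration copies 16*SZREG bytes. Biasing t0 by
         * -(16*SZREG-1) makes "a0 < t0" mean "at least 16*SZREG
         * bytes remain". Loads and stores are issued in two batches
         * (11 words, then 5) so that only the free temporaries
         * a2-a7 and t1-t5 are live at any point.
         */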
1:
        REG_L   a2,        0(a1)
        REG_L   a3,    SZREG(a1)
        REG_L   a4,  2*SZREG(a1)
        REG_L   a5,  3*SZREG(a1)
        REG_L   a6,  4*SZREG(a1)
        REG_L   a7,  5*SZREG(a1)
        REG_L   t1,  6*SZREG(a1)
        REG_L   t2,  7*SZREG(a1)
        REG_L   t3,  8*SZREG(a1)
        REG_L   t4,  9*SZREG(a1)
        REG_L   t5, 10*SZREG(a1)
        REG_S   a2,        0(a0)
        REG_S   a3,    SZREG(a0)
        REG_S   a4,  2*SZREG(a0)
        REG_S   a5,  3*SZREG(a0)
        REG_S   a6,  4*SZREG(a0)
        REG_S   a7,  5*SZREG(a0)
        REG_S   t1,  6*SZREG(a0)
        REG_S   t2,  7*SZREG(a0)
        REG_S   t3,  8*SZREG(a0)
        REG_S   t4,  9*SZREG(a0)
        REG_S   t5, 10*SZREG(a0)
        REG_L   a2, 11*SZREG(a1)
        REG_L   a3, 12*SZREG(a1)
        REG_L   a4, 13*SZREG(a1)
        REG_L   a5, 14*SZREG(a1)
        REG_L   a6, 15*SZREG(a1)
        addi    a1, a1, 16*SZREG
        REG_S   a2, 11*SZREG(a0)
        REG_S   a3, 12*SZREG(a0)
        REG_S   a4, 13*SZREG(a0)
        REG_S   a5, 14*SZREG(a0)
        REG_S   a6, 15*SZREG(a0)
        addi    a0, a0, 16*SZREG
        bltu    a0, t0, 1b
2:
        /*
         * Undo the 16*SZREG-1 bias above and apply the SZREG-1 bias
         * for the wordwise loop below: a net adjustment of 15*SZREG.
         */
        addi    t0, t0, 15*SZREG

        /* Wordwise copy */
        bgeu    a0, t0, 2f
1:
        REG_L   a5, 0(a1)
        addi    a1, a1, SZREG
        REG_S   a5, 0(a0)
        addi    a0, a0, SZREG
        bltu    a0, t0, 1b
2:
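        /* Undo the SZREG-1 bias so t0 is again the true end of dst */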
        addi    t0, t0, SZREG-1

.Lbyte_copy_tail:
        /*
         * Bytewise copy anything left.
         */
        beq     a0, t0, 2f
1:
        lb      a5, 0(a1)
        addi    a1, a1, 1
        sb      a5, 0(a0)
        addi    a0, a0, 1
        bne     a0, t0, 1b
2:

        mv      a0, t6
        ret

.Lmisaligned_word_copy:
        /*
         * Misaligned word-wise copy.
         * We still copy word by word, but each destination word is
         * assembled from two consecutive source words using shifts.
         * This is safe because we never access more words than
         * necessary.
         */
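
        /*
         * In rough C terms (illustrative, little-endian):
         *
         *   cur = *src++;
         *   do {
         *       next = *src++;
         *       *dst++ = (cur >> t3) | (next << t4);
         *       cur = next;
         *   } while (dst < end);
         */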

        /* Calculate shifts */
        slli    t3, a3, 3
        sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
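        /*
         * Example: a 3-byte misalignment gives t3 = 24 and t4 = -24;
         * RV64 shifts use only the low 6 bits of the amount, so t4
         * behaves as 64 - 24 = 40.
         */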

        /* Load the initial value and align a1 */
        andi    a1, a1, ~(SZREG-1)
        REG_L   a5, 0(a1)
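        /*
         * a5 now holds the word containing the first uncopied source
         * byte; its low a3 bytes lie before the first uncopied byte
         * and are discarded by the shift in the loop below.
         */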

        addi    t0, t0, -(SZREG-1)
        /* At least one iteration is always executed here, so no entry check */
1:
        srl     a4, a5, t3
        REG_L   a5, SZREG(a1)
        addi    a1, a1, SZREG
        sll     a2, a5, t4
        or      a2, a2, a4
        REG_S   a2, 0(a0)
        addi    a0, a0, SZREG
        bltu    a0, t0, 1b

        /* Restore t0 and advance a1 to the first uncopied source byte */
        addi    t0, t0, SZREG-1
        add     a1, a1, a3

        j       .Lbyte_copy_tail