powerpc: Fix ABIv2 issues with stack offsets in assembly code
[firefly-linux-kernel-4.4.55.git] / arch / powerpc / lib / memcpy_64.S
1 /*
2  * Copyright (C) 2002 Paul Mackerras, IBM Corp.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version
7  * 2 of the License, or (at your option) any later version.
8  */
9 #include <asm/processor.h>
10 #include <asm/ppc_asm.h>
11
/*
 * memcpy - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * Register roles visible in this routine:
 *   r3  destination pointer.  It is saved at STK_PARAM(R3)(r1) on entry
 *       and reloaded from that slot before each blr as the return value.
 *   r4  source pointer.
 *   r5  byte count.
 *
 * Flow:
 *   - counts below 16 are handled by .Lshort_copy;
 *   - otherwise, if the destination is not 8-byte aligned, .Ldst_unaligned
 *     copies the 1-7 leading bytes and comes back to .Ldst_aligned;
 *   - .Ldst_aligned copies 16 bytes per loop iteration and .Ldo_tail
 *     stores the final 0-7 bytes;
 *   - .Lsrc_unaligned is used instead of the loop here when the source is
 *     not 8-byte aligned and the check for it has not been patched out.
 *
 * cr7 is loaded from the low four bits of the byte count, so the bits
 * cr7*4+0 .. cr7*4+3 select the 8, 4, 2 and 1 byte pieces respectively.
 */
12         .align  7
13 _GLOBAL(memcpy)
        /*
         * NOTE(review): the *_FTR_SECTION macros are defined outside this
         * file.  Judging by the _IFCLR suffix, the std is presumably kept
         * when CPU_FTR_VMX_COPY is clear and replaced by the branch to
         * memcpy_power7 otherwise - confirm against the macro definitions.
         * With SELFTEST defined the alternative section is empty.
         */
14 BEGIN_FTR_SECTION
15         std     r3,STK_PARAM(R3)(r1)    /* save destination pointer for return value */
16 FTR_SECTION_ELSE
17 #ifndef SELFTEST
18         b       memcpy_power7
19 #endif
20 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
21         PPC_MTOCRF(0x01,r5)             /* cr7 = low 4 bits of the byte count */
22         cmpldi  cr1,r5,16               /* cr1 = byte count compared with 16, unsigned */
23         neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
24         andi.   r6,r6,7                 /* record form: cr0 feeds the bne below */
25         dcbt    0,r4                    /* cache touch hint for the start of the source */
26         blt     cr1,.Lshort_copy        /* fewer than 16 bytes to copy */
27 /* Below we want to nop out the bne if we're on a CPU that has the
28    CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
29    cleared.
30    At the time of writing the only CPU that has this combination of bits
31    set is Power6. */
32 BEGIN_FTR_SECTION
33         nop
34 FTR_SECTION_ELSE
35         bne     .Ldst_unaligned         /* r6 != 0: destination is not 8-byte aligned */
36 ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
37                     CPU_FTR_UNALIGNED_LD_STD)
        /*
         * Also the re-entry point used by .Ldst_unaligned once the leading
         * bytes have been copied.  r3 is biased by -16 from here on so that
         * the stores in the loop can use displacements 8 and 16.
         */
38 .Ldst_aligned:
39         addi    r3,r3,-16
40 BEGIN_FTR_SECTION
41         andi.   r0,r4,7                 /* r0 = source offset within a doubleword */
42         bne     .Lsrc_unaligned         /* non-zero offset: use the shift-and-merge path */
43 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
44         srdi    r7,r5,4                 /* r7 = byte count / 16 = loop iterations */
45         ld      r9,0(r4)                /* first source doubleword */
46         addi    r4,r4,-8                /* bias r4 to suit the ldu displacement of 16 */
47         mtctr   r7
48         andi.   r5,r5,7                 /* cr0.eq set when no 1-7 byte tail remains */
        /*
         * Main loop: 16 bytes per iteration, alternating r8 and r9, with
         * each doubleword loaded one step before it is stored.  When the 8s
         * bit of the count is clear the loop is entered at 2:.  Otherwise the
         * first doubleword moves to r8 and both pointers advance by 8 first;
         * if cr1 says fewer than 16 bytes remain (cr1 is recomputed by
         * .Ldst_unaligned when coming from there) the loop is skipped.
         */
49         bf      cr7*4+0,2f
50         addi    r3,r3,8
51         addi    r4,r4,8
52         mr      r8,r9
53         blt     cr1,3f
54 1:      ld      r9,8(r4)
55         std     r8,8(r3)
56 2:      ldu     r8,16(r4)
57         stdu    r9,16(r3)
58         bdnz    1b
59 3:      std     r8,8(r3)                /* store the last doubleword loaded */
60         beq     3f                      /* cr0 from the andi. above: no tail, return */
61         addi    r3,r3,16                /* remove the bias: r3 = next byte to write */
        /*
         * Tail: store the remaining 4, 2 and 1 byte pieces selected by cr7.
         * The next unread source byte is at 8(r4).
         */
62 .Ldo_tail:
63         bf      cr7*4+1,1f
64         lwz     r9,8(r4)
65         addi    r4,r4,4
66         stw     r9,0(r3)
67         addi    r3,r3,4
68 1:      bf      cr7*4+2,2f
69         lhz     r9,8(r4)
70         addi    r4,r4,2
71         sth     r9,0(r3)
72         addi    r3,r3,2
73 2:      bf      cr7*4+3,3f
74         lbz     r9,8(r4)
75         stb     r9,0(r3)
76 3:      ld      r3,STK_PARAM(R3)(r1)    /* return dest pointer */
77         blr
78
/*
 * .Lsrc_unaligned - destination is 8-byte aligned, source is not.
 *
 * Entered from .Ldst_aligned with r0 = r4 & 7 (non-zero), r3 already
 * biased by -16, r5 = byte count and cr7 = low four bits of the count.
 *
 * The source pointer is rounded down to a doubleword boundary and only
 * aligned doublewords are loaded.  Each destination doubleword is built as
 *         (sN << r10) | (sN+1 >> r11)
 * where r10 = 8 * r0 and r11 = 64 - r10.
 *
 * NOTE(review): merging the left-shifted earlier doubleword with the
 * right-shifted later one looks like it assumes big-endian byte order -
 * confirm before relying on this path in a little-endian build.
 */
79 .Lsrc_unaligned:
80         srdi    r6,r5,3                 /* r6 = byte count / 8 = whole doublewords */
81         addi    r5,r5,-16
82         subf    r4,r0,r4                /* round r4 down to an 8-byte boundary */
83         srdi    r7,r5,4                 /* r7 = (byte count - 16) / 16 = loop iterations */
84         sldi    r10,r0,3                /* r10 = left shift: source offset in bits */
85         cmpdi   cr6,r6,3                /* cr6 = doubleword count compared with 3 */
86         andi.   r5,r5,7                 /* r5 = tail byte count; cr0.eq set when it is 0 */
87         mtctr   r7
88         subfic  r11,r10,64              /* r11 = right shift: 64 - r10 */
89         add     r5,r5,r0                /* r5 = tail bytes + source offset, tested at 5: */
90
91         bt      cr7*4+0,0f              /* 8s bit set: odd number of doublewords */
92
        /* Even number of doublewords. */
93         ld      r9,0(r4)        # 3+2n loads, 2+2n stores
94         ld      r0,8(r4)
95         sld     r6,r9,r10
96         ldu     r9,16(r4)
97         srd     r7,r0,r11
98         sld     r8,r0,r10
99         or      r7,r7,r6
100         blt     cr6,4f                  /* fewer than 3 doublewords: skip the loop */
101         ld      r0,8(r4)
102         # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
103         b       2f
104
        /* Odd number of doublewords: r3 gets a further bias of -8. */
105 0:      ld      r0,0(r4)        # 4+2n loads, 3+2n stores
106         ldu     r9,8(r4)
107         sld     r8,r0,r10
108         addi    r3,r3,-8
109         blt     cr6,5f                  /* fewer than 3 doublewords: only the store at 5: */
110         ld      r0,8(r4)
111         srd     r12,r9,r11
112         sld     r6,r9,r10
113         ldu     r9,16(r4)
114         or      r12,r8,r12
115         srd     r7,r0,r11
116         sld     r8,r0,r10
117         addi    r3,r3,16
118         beq     cr6,3f                  /* exactly 3 doublewords: skip the loop */
119
        /* Main loop: two merged doublewords are stored per iteration. */
120         # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
121 1:      or      r7,r7,r6
122         ld      r0,8(r4)
123         std     r12,8(r3)
124 2:      srd     r12,r9,r11
125         sld     r6,r9,r10
126         ldu     r9,16(r4)
127         or      r12,r8,r12
128         stdu    r7,16(r3)
129         srd     r7,r0,r11
130         sld     r8,r0,r10
131         bdnz    1b
132
        /* Drain the doublewords still held in registers. */
133 3:      std     r12,8(r3)
134         or      r7,r7,r6
135 4:      std     r7,16(r3)
136 5:      srd     r12,r9,r11
137         or      r12,r8,r12
138         std     r12,24(r3)
        /*
         * cr0 still comes from the andi. in the setup above: with no tail
         * bytes the copy is complete.  No 4: label follows inside this
         * section, so 4f resolves to the next 4: later in the file (in the
         * visible source, the return sequence that ends .Lshort_copy).
         */
139         beq     4f
        /*
         * Tail of 1-7 bytes.  The unused bytes of the last loaded doubleword
         * are shifted to the top of r9.  When tail bytes + source offset
         * exceed 8 they do not cover the tail, so one more doubleword is
         * loaded and merged in.
         */
140         cmpwi   cr1,r5,8
141         addi    r3,r3,32                /* remove the bias: r3 = next byte to write */
142         sld     r9,r9,r10
143         ble     cr1,6f
144         ld      r0,8(r4)
145         srd     r7,r0,r11
146         or      r9,r7,r9
        /*
         * Store the 4, 2 and 1 byte pieces selected by cr7.  Each rotldi
         * brings the next piece from the top of r9 into its low-order bits.
         */
147 6:
148         bf      cr7*4+1,1f
149         rotldi  r9,r9,32
150         stw     r9,0(r3)
151         addi    r3,r3,4
152 1:      bf      cr7*4+2,2f
153         rotldi  r9,r9,16
154         sth     r9,0(r3)
155         addi    r3,r3,2
156 2:      bf      cr7*4+3,3f
157         rotldi  r9,r9,8
158         stb     r9,0(r3)
159 3:      ld      r3,STK_PARAM(R3)(r1)    /* return dest pointer */
160         blr
161
/*
 * .Ldst_unaligned - copy the leading bytes needed to bring the destination
 * to an 8-byte boundary, then continue at .Ldst_aligned.
 *
 * On entry r6 = (-r3) & 7, the non-zero number of bytes to that boundary.
 * Its low bits are placed in cr7 and drive a 1, then 2, then 4 byte copy.
 * r7 is the running offset applied to both r4 and r3.
 */
162 .Ldst_unaligned:
163         PPC_MTOCRF(0x01,r6)             # put #bytes to 8B bdry into cr7
164         subf    r5,r6,r5                /* remaining byte count after the leading bytes */
165         li      r7,0
166         cmpldi  cr1,r5,16               /* refresh cr1 for the remaining count */
167         bf      cr7*4+3,1f
168         lbz     r0,0(r4)
169         stb     r0,0(r3)
170         addi    r7,r7,1
171 1:      bf      cr7*4+2,2f
172         lhzx    r0,r7,r4
173         sthx    r0,r7,r3
174         addi    r7,r7,2
175 2:      bf      cr7*4+1,3f
176         lwzx    r0,r7,r4
177         stwx    r0,r7,r3
178 3:      PPC_MTOCRF(0x01,r5)             /* cr7 = low 4 bits of the remaining count */
179         add     r4,r6,r4                /* step both pointers past the bytes just copied */
180         add     r3,r6,r3
181         b       .Ldst_aligned
182
/*
 * .Lshort_copy - copy fewer than 16 bytes.
 *
 * Reached from the entry code when the byte count is below 16, with cr7
 * holding the low four bits of the count.  Each cr7 bit selects one piece:
 * 8 bytes (moved as two words), then 4, 2 and 1.  r4 and r3 advance after
 * every piece except the last.
 *
 * The final 4: label is also the target of the "beq 4f" in
 * .Lsrc_unaligned, which uses it as its return sequence.
 */
183 .Lshort_copy:
184         bf      cr7*4+0,1f
185         lwz     r0,0(r4)
186         lwz     r9,4(r4)
187         addi    r4,r4,8
188         stw     r0,0(r3)
189         stw     r9,4(r3)
190         addi    r3,r3,8
191 1:      bf      cr7*4+1,2f
192         lwz     r0,0(r4)
193         addi    r4,r4,4
194         stw     r0,0(r3)
195         addi    r3,r3,4
196 2:      bf      cr7*4+2,3f
197         lhz     r0,0(r4)
198         addi    r4,r4,2
199         sth     r0,0(r3)
200         addi    r3,r3,2
201 3:      bf      cr7*4+3,4f
202         lbz     r0,0(r4)
203         stb     r0,0(r3)
204 4:      ld      r3,STK_PARAM(R3)(r1)    /* return dest pointer */
205         blr