// Inferno's libkern/memmove-arm.s
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-arm.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include "textflag.h"

// Register roles for the copy loops below.
// TS/TE track the destination: TS starts at "to", TE at "to"+n.
// TE or TS are spilled to the stack during bulk register moves.
#define TS R0
#define TE R8
// Warning: the linker will use R11 to synthesize certain instructions. Please
// take care and double check with objdump.
#define FROM R11
#define N R12
#define TMP R12 /* N and TMP don't overlap */
#define TMP1 R5

// Shift amounts and source fix-up used by the unaligned (shifted) copy loops.
#define RSHIFT R5
#define LSHIFT R6
#define OFFSET R7

// Backward unaligned copy: BRn are read registers filled by MOVM.DB,
// BWn are the shifted write registers. Note BW(n) aliases BR(n-1), so the
// loops below consume each read register before overwriting it.
#define BR0 R0 /* shared with TS */
#define BW0 R1
#define BR1 R1
#define BW1 R2
#define BR2 R2
#define BW2 R3
#define BR3 R3
#define BW3 R4

// Forward unaligned copy: same scheme in the other direction; FW(n) aliases
// FR(n-1) and FR3 carries the leftover word across loop iterations.
#define FW0 R1
#define FR0 R2
#define FW1 R2
#define FR1 R3
#define FW2 R3
#define FR2 R4
#define FW3 R4
#define FR3 R8 /* shared with TE */
// func memmove(to, from unsafe.Pointer, n uintptr)
//
// Copies n bytes from "from" to "to", correct for overlapping regions:
// when the destination lies above the source it copies backward
// (high address to low), otherwise forward. Returns the "to" pointer
// in R0. Uses 4 bytes of frame (savedts/savedte) to spill TS or TE
// around the MOVM bulk-move loops, which need the full R0-R8 range.
TEXT runtime·memmove(SB), NOSPLIT, $4-12
_memmove:
MOVW to+0(FP), TS
MOVW from+4(FP), FROM
MOVW n+8(FP), N
ADD N, TS, TE /* to end pointer */
CMP FROM, TS
BLS _forward /* dst <= src (unsigned): forward copy is overlap-safe */
_back:
ADD N, FROM /* from end pointer */
CMP $4, N /* need at least 4 bytes to copy */
BLT _b1tail
_b4align: /* align destination on 4 */
AND.S $3, TE, TMP
BEQ _b4aligned
MOVBU.W -1(FROM), TMP /* pre-indexed */
MOVBU.W TMP, -1(TE) /* pre-indexed */
B _b4align
_b4aligned: /* is source now aligned? */
AND.S $3, FROM, TMP
BNE _bunaligned /* no: fall back to the shifted backward copy */
ADD $31, TS, TMP /* do 32-byte chunks if possible */
MOVW TS, savedts-4(SP) /* spill TS: the MOVM below clobbers R0 */
_b32loop:
CMP TMP, TE
BLS _b4tail /* fewer than 32 bytes left */
MOVM.DB.W (FROM), [R0-R7] /* load 8 words, descending, writeback */
MOVM.DB.W [R0-R7], (TE)
B _b32loop
_b4tail: /* do remaining words if possible */
MOVW savedts-4(SP), TS /* restore TS for the byte tail */
ADD $3, TS, TMP
_b4loop:
CMP TMP, TE
BLS _b1tail /* fewer than 4 bytes left */
MOVW.W -4(FROM), TMP1 /* pre-indexed */
MOVW.W TMP1, -4(TE) /* pre-indexed */
B _b4loop
_b1tail: /* remaining bytes */
CMP TE, TS
BEQ _return /* done when TE has walked down to TS */
MOVBU.W -1(FROM), TMP /* pre-indexed */
MOVBU.W TMP, -1(TE) /* pre-indexed */
B _b1tail
_forward:
CMP $4, N /* need at least 4 bytes to copy */
BLT _f1tail
_f4align: /* align destination on 4 */
AND.S $3, TS, TMP
BEQ _f4aligned
MOVBU.P 1(FROM), TMP /* implicit write back */
MOVBU.P TMP, 1(TS) /* implicit write back */
B _f4align
_f4aligned: /* is source now aligned? */
AND.S $3, FROM, TMP
BNE _funaligned /* no: fall back to the shifted forward copy */
SUB $31, TE, TMP /* do 32-byte chunks if possible */
MOVW TE, savedte-4(SP) /* spill TE: the MOVM below clobbers R8 */
_f32loop:
CMP TMP, TS
BHS _f4tail /* fewer than 32 bytes left */
MOVM.IA.W (FROM), [R1-R8] /* load 8 words, ascending, writeback */
MOVM.IA.W [R1-R8], (TS)
B _f32loop
_f4tail:
MOVW savedte-4(SP), TE /* restore TE for the byte tail */
SUB $3, TE, TMP /* do remaining words if possible */
_f4loop:
CMP TMP, TS
BHS _f1tail /* fewer than 4 bytes left */
MOVW.P 4(FROM), TMP1 /* implicit write back */
MOVW.P TMP1, 4(TS) /* implicit write back */
B _f4loop
_f1tail:
CMP TS, TE
BEQ _return /* done when TS has caught up with TE */
MOVBU.P 1(FROM), TMP /* implicit write back */
MOVBU.P TMP, 1(TS) /* implicit write back */
B _f1tail
_return:
MOVW to+0(FP), R0 /* return the destination pointer */
RET

/*
 * Backward copy with a misaligned source (TMP = FROM & 3, here 1..3):
 * FROM is rounded down to a word boundary and whole words are loaded,
 * then each output word is assembled from two adjacent input words with
 * LSHIFT/RSHIFT (which always sum to 32). The flags from the AND.S at
 * _b4aligned are still live for the conditional MOVWs below.
 */
_bunaligned:
CMP $2, TMP /* is TMP < 2 ? */
MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */
MOVW.LT $24, LSHIFT
MOVW.LT $1, OFFSET
MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */
MOVW.EQ $16, LSHIFT
MOVW.EQ $2, OFFSET
MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */
MOVW.GT $8, LSHIFT
MOVW.GT $3, OFFSET
ADD $16, TS, TMP /* do 16-byte chunks if possible */
CMP TMP, TE
BLS _b1tail /* too short for a 16-byte chunk: byte copy */
BIC $3, FROM /* align source */
MOVW TS, savedts-4(SP) /* spill TS: BR0 shares R0 */
MOVW (FROM), BR0 /* prime first block register */
_bu16loop:
CMP TMP, TE
BLS _bu1tail
MOVW BR0<<LSHIFT, BW3 /* high part of output comes from prior word */
MOVM.DB.W (FROM), [BR0-BR3] /* next 4 source words, descending */
ORR BR3>>RSHIFT, BW3 /* merge in adjacent word; repeat down the block */
MOVW BR3<<LSHIFT, BW2
ORR BR2>>RSHIFT, BW2
MOVW BR2<<LSHIFT, BW1
ORR BR1>>RSHIFT, BW1
MOVW BR1<<LSHIFT, BW0
ORR BR0>>RSHIFT, BW0
MOVM.DB.W [BW0-BW3], (TE)
B _bu16loop
_bu1tail:
MOVW savedts-4(SP), TS /* restore TS for the byte tail */
ADD OFFSET, FROM /* undo the BIC: back to exact byte position */
B _b1tail

/*
 * Forward copy with a misaligned source: mirror image of _bunaligned.
 * FR3 carries the leftover source word from one iteration to the next
 * (primed once before the loop). Flags from the AND.S at _f4aligned
 * select the shift constants.
 */
_funaligned:
CMP $2, TMP
MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */
MOVW.LT $24, LSHIFT
MOVW.LT $3, OFFSET
MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */
MOVW.EQ $16, LSHIFT
MOVW.EQ $2, OFFSET
MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */
MOVW.GT $8, LSHIFT
MOVW.GT $1, OFFSET
SUB $16, TE, TMP /* do 16-byte chunks if possible */
CMP TMP, TS
BHS _f1tail /* too short for a 16-byte chunk: byte copy */
BIC $3, FROM /* align source */
MOVW TE, savedte-4(SP) /* spill TE: FR3 shares R8 */
MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */
_fu16loop:
CMP TMP, TS
BHS _fu1tail
MOVW FR3>>RSHIFT, FW0 /* low part of output comes from carried word */
MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3] /* next 4 source words, ascending */
ORR FR0<<LSHIFT, FW0 /* merge in adjacent word; repeat up the block */
MOVW FR0>>RSHIFT, FW1
ORR FR1<<LSHIFT, FW1
MOVW FR1>>RSHIFT, FW2
ORR FR2<<LSHIFT, FW2
MOVW FR2>>RSHIFT, FW3
ORR FR3<<LSHIFT, FW3
MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
B _fu16loop
_fu1tail:
MOVW savedte-4(SP), TE /* restore TE for the byte tail */
SUB OFFSET, FROM /* undo the BIC and priming: exact byte position */
B _f1tail