android_kernel_xiaomi_sm8350/arch/cris/arch-v10/lib/memset.c

/*#************************************************************************#*/
/*#-------------------------------------------------------------------------*/
/*#                                                                         */
/*# FUNCTION NAME: memset()                                                 */
/*#                                                                         */
/*# PARAMETERS:  void*  pdst;  Destination address.                         */
/*#              int       c;  Value of byte to write.                      */
/*#              size_t plen;  Number of bytes to write.                    */
/*#                                                                         */
/*# RETURNS:     dst.                                                       */
/*#                                                                         */
/*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
/*#              Framework taken from memcpy.  This routine is              */
/*#              very sensitive to compiler changes in register allocation. */
/*#              Should really be rewritten to avoid this problem.          */
/*#                                                                         */
/*#-------------------------------------------------------------------------*/
/*#                                                                         */
/*# HISTORY                                                                 */
/*#                                                                         */
/*# DATE      NAME            CHANGES                                       */
/*# ----      ----            -------                                       */
/*# 990713    HP              Tired of watching this function (or           */
/*#                           really, the nonoptimized generic              */
/*#                           implementation) take up 90% of simulator      */
/*#                           output.  Measurements needed.                 */
/*#                                                                         */
/*#-------------------------------------------------------------------------*/

#include <linux/types.h>

/* No, there's no macro saying 12*4, since it is "hard" to get it into
   the asm in a good way.  Thus better to expose the problem everywhere.
   */

/* Assuming 1 cycle per dword written or read (ok, not really true), and
   one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
   so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */

#define ZERO_BLOCK_SIZE (1*12*4)

/*
 * memset() - fill a memory area with a byte value (CRIS-specific version).
 *
 * pdst: start of the area to fill.
 * c:    fill value; the low byte is replicated into a full dword.
 * plen: number of bytes to fill.  0 is a legal length and writes nothing.
 *
 * Returns pdst.
 *
 * Strategy:
 *   1. Replicate the low byte of c into all four bytes of a dword.
 *   2. If pdst is not dword aligned and at least 3 bytes are requested,
 *      store a leading byte and/or short so that dst becomes aligned.
 *   3. While at least ZERO_BLOCK_SIZE (48) bytes remain, store 48-byte
 *      blocks with 'movem'.
 *   4. Store 16-byte groups as four dword writes.
 *   5. Finish the remaining 0..15 bytes with a fully unrolled switch.
 *
 * The routine relies on the explicit register bindings below (r10..r13)
 * matching the registers hard-coded in the assembly.  Do not reorder or
 * rename them without checking the generated code.
 */
void *memset(void *pdst,
             int c,
             size_t plen)
{
  /* Ok.  Now we want the parameters put in special registers.
     Make sure the compiler is able to make something useful of this. */

  register char *return_dst __asm__ ("r10") = pdst;
  register int n __asm__ ("r12") = plen;
  register int lc __asm__ ("r11") = c;

  /* Most apps use memset sanely.  Only those memsetting about 3..4
     bytes or less get penalized compared to the generic implementation
     - and that's not really sane use. */

  /* Build the fill dword: the low byte of lc is copied into the two low
     bytes of r13, that 16-bit pattern is moved back to lc, and then
     OR-ed with itself shifted up by 16 bits.

     Ugh.  This is fragile at best.  Check with newer GCC releases, if
     they compile cascaded "x |= x << 8" sanely! */
  __asm__("movu.b %0,$r13\n\t"
          "lslq 8,$r13\n\t"
          "move.b %0,$r13\n\t"
          "move.d $r13,%0\n\t"
          "lslq 16,$r13\n\t"
          "or.d $r13,%0"
          : "=r" (lc) : "0" (lc) : "r13");

  {
    /* r13 was only scratch space above; from here on it holds the
       running destination pointer. */
    register char *dst __asm__ ("r13") = pdst;

    /* This is NONPORTABLE, but since this whole routine is
       grossly nonportable that doesn't matter. */

    if (((unsigned long) pdst & 3) != 0
        /* Oops! n=0 must be a legal call, regardless of alignment. */
        && n >= 3)
      {
        /* Odd address: one byte brings dst to a 2-byte boundary. */
        if ((unsigned long) dst & 1)
          {
            *dst = (char) lc;
            n--;
            dst++;
          }

        /* Still not on a 4-byte boundary: one short fixes that.  With
           n >= 3 on entry there are always at least 2 bytes left here. */
        if ((unsigned long) dst & 2)
          {
            *(short *) dst = lc;
            n -= 2;
            dst += 2;
          }
      }

    /* Now the fun part.  For the threshold value of this, check the
       equation next to the ZERO_BLOCK_SIZE definition. */
    /* Decide which setting method to use. */
    if (n >= ZERO_BLOCK_SIZE)
      {
        /* For large areas we use 'movem'.

           It is not optimal to tell the compiler about clobbering any
           registers; that will move the saving/restoring of those
           registers to the function prologue/epilogue, and make
           non-movem sizes suboptimal.

           This method is not foolproof; it assumes that the "asm reg"
           declarations at the beginning of the function really are used
           here (beware: they may be moved to temporary registers).
           This way, we do not have to save/move the registers around
           into temporaries; we can safely use them straight away.

           If you want to check that the allocation was right, check the
           equalities in the first assembler comment of the generated
           code.  It should say "r13=r13, r12=r12, r11=r11".

           Loop structure: n is pre-decremented by one block, then each
           pass subtracts another block and branches while the result is
           non-negative.  The movem after 'bge' sits in the branch delay
           slot, so it also runs on the final, not-taken pass; that is
           why n is decremented twice before the first store and
           re-incremented once after the loop.

           The template is written as adjacent string literals because
           string literals containing raw newlines are not valid C. */
        __asm__ volatile (
          ";; Check that the following is true (same register names on\n\t"
          ";; both sides of equal sign, as in r8=r8):\n\t"
          ";; %0=r13, %1=r12, %4=r11\n\t"
          ";;\n\t"
          ";; Save the registers we'll clobber in the movem process\n\t"
          ";; on the stack.  Don't mention them to gcc, it will only be\n\t"
          ";; upset.\n\t"
          "subq 11*4,$sp\n\t"
          "movem $r10,[$sp]\n\t"
          "\n\t"
          "move.d $r11,$r0\n\t"
          "move.d $r11,$r1\n\t"
          "move.d $r11,$r2\n\t"
          "move.d $r11,$r3\n\t"
          "move.d $r11,$r4\n\t"
          "move.d $r11,$r5\n\t"
          "move.d $r11,$r6\n\t"
          "move.d $r11,$r7\n\t"
          "move.d $r11,$r8\n\t"
          "move.d $r11,$r9\n\t"
          "move.d $r11,$r10\n\t"
          "\n\t"
          ";; Now we've got this:\n\t"
          ";; r13 - dst\n\t"
          ";; r12 - n\n\t"
          "\n\t"
          ";; Update n for the first loop\n\t"
          "subq 12*4,$r12\n"
          "0:\n\t"
          "subq 12*4,$r12\n\t"
          "bge 0b\n\t"
          "movem $r11,[$r13+]\n\t"
          "\n\t"
          "addq 12*4,$r12  ;; compensate for last loop underflowing n\n\t"
          "\n\t"
          ";; Restore registers from stack\n\t"
          "movem [$sp+],$r10"

          /* Outputs */ : "=r" (dst), "=r" (n)
          /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
      }

    /* Either we start setting directly, using dword stores in a loop,
       or we have set as much as possible with 'movem' and the last
       part (< 48 bytes) is handled here.  This works since the 'movem'
       loop has updated dst and n. */

    while (n >= 16)
      {
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        n -= 16;
      }

    /* A switch() is definitely the fastest although it takes a LOT of
       code.  Particularly if you inline code this.

       Each case stores n bytes as dwords first, then at most one short,
       then at most one byte.  The pointer is advanced with an explicit
       "dst += size" because incrementing the result of a cast is not
       valid C. */
    switch (n)
      {
      case 0:
        break;

      case 1:
        *dst = (char) lc;
        break;

      case 2:
        *(short *) dst = (short) lc;
        break;

      case 3:
        *(short *) dst = (short) lc; dst += 2;
        *dst = (char) lc;
        break;

      case 4:
        *(long *) dst = lc; dst += 4;
        break;

      case 5:
        *(long *) dst = lc; dst += 4;
        *dst = (char) lc;
        break;

      case 6:
        *(long *) dst = lc; dst += 4;
        *(short *) dst = (short) lc;
        break;

      case 7:
        *(long *) dst = lc; dst += 4;
        *(short *) dst = (short) lc; dst += 2;
        *dst = (char) lc;
        break;

      case 8:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        break;

      case 9:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *dst = (char) lc;
        break;

      case 10:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(short *) dst = (short) lc;
        break;

      case 11:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(short *) dst = (short) lc; dst += 2;
        *dst = (char) lc;
        break;

      case 12:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        break;

      case 13:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *dst = (char) lc;
        break;

      case 14:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(short *) dst = (short) lc;
        break;

      case 15:
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(long *) dst = lc; dst += 4;
        *(short *) dst = (short) lc; dst += 2;
        *dst = (char) lc;
        break;
      }
  }

  return return_dst; /* destination pointer. */
} /* memset() */
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 18:20:36 -04:00			`/#**********************************************************************#/`
			`/#-------------------------------------------------------------------------/`
			`/# /`
			`/# FUNCTION NAME: memset() /`
			`/# /`
			`/# PARAMETERS: void dst; Destination address. */`
			`/# int c; Value of byte to write. /`
			`/# int len; Number of bytes to write. /`
			`/# /`
			`/# RETURNS: dst. /`
			`/# /`
			`/# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. /`
			`/# Framework taken from memcpy. This routine is /`
			`/# very sensitive to compiler changes in register allocation. /`
			`/# Should really be rewritten to avoid this problem. /`
			`/# /`
			`/#-------------------------------------------------------------------------/`
			`/# /`
			`/# HISTORY /`
			`/# /`
			`/# DATE NAME CHANGES /`
			`/# ---- ---- ------- /`
			`/# 990713 HP Tired of watching this function (or /`
			`/# really, the nonoptimized generic /`
			`/# implementation) take up 90% of simulator /`
			`/# output. Measurements needed. /`
			`/# /`
			`/#-------------------------------------------------------------------------/`

			`#include <linux/types.h>`

			`/* No, there's no macro saying 12*4, since it is "hard" to get it into`
			`the asm in a good way. Thus better to expose the problem everywhere.`
			`*/`

			`/* Assuming 1 cycle per dword written or read (ok, not really true), and`
			`one per instruction, then 43+3(n/48-1) <= 24+24(n/48-1)`
			`so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */`

			`#define ZERO_BLOCK_SIZE (1124)`

			`void memset(void pdst,`
			`int c,`
			`size_t plen)`
			`{`
			`/* Ok. Now we want the parameters put in special registers.`
			`Make sure the compiler is able to make something useful of this. */`

			`register char *return_dst __asm__ ("r10") = pdst;`
			`register int n __asm__ ("r12") = plen;`
			`register int lc __asm__ ("r11") = c;`

			`/* Most apps use memset sanely. Only those memsetting about 3..4`
			`bytes or less get penalized compared to the generic implementation`
			`- and that's not really sane use. */`

			`/* Ugh. This is fragile at best. Check with newer GCC releases, if`
			`they compile cascaded "x \|= x << 8" sanely! */`
			`__asm__("movu.b %0,$r13\n\t"`
			`"lslq 8,$r13\n\t"`
			`"move.b %0,$r13\n\t"`
			`"move.d $r13,%0\n\t"`
			`"lslq 16,$r13\n\t"`
			`"or.d $r13,%0"`
			`: "=r" (lc) : "0" (lc) : "r13");`

			`{`
			`register char *dst __asm__ ("r13") = pdst;`

			`/* This is NONPORTABLE, but since this whole routine is */`
			`/* grossly nonportable that doesn't matter. */`

			`if (((unsigned long) pdst & 3) != 0`
			`/* Oops! n=0 must be a legal call, regardless of alignment. */`
			`&& n >= 3)`
			`{`
			`if ((unsigned long)dst & 1)`
			`{`
			`*dst = (char) lc;`
			`n--;`
			`dst++;`
			`}`

			`if ((unsigned long)dst & 2)`
			`{`
			`(short )dst = lc;`
			`n -= 2;`
			`dst += 2;`
			`}`
			`}`

			`/* Now the fun part. For the threshold value of this, check the equation`
			`above. */`
			`/* Decide which copying method to use. */`
			`if (n >= ZERO_BLOCK_SIZE)`
			`{`
			`/* For large copies we use 'movem' */`

			`/* It is not optimal to tell the compiler about clobbering any`
			`registers; that will move the saving/restoring of those registers`
			`to the function prologue/epilogue, and make non-movem sizes`
			`suboptimal.`

			`This method is not foolproof; it assumes that the "asm reg"`
			`declarations at the beginning of the function really are used`
			`here (beware: they may be moved to temporary registers).`
			`This way, we do not have to save/move the registers around into`
			`temporaries; we can safely use them straight away.`

			`If you want to check that the allocation was right; then`
			`check the equalities in the first comment. It should say`
			`"r13=r13, r12=r12, r11=r11" */`
			`__asm__ volatile ("`
			`;; Check that the following is true (same register names on`
			`;; both sides of equal sign, as in r8=r8):`
			`;; %0=r13, %1=r12, %4=r11`
			`;;`
			`;; Save the registers we'll clobber in the movem process`
			`;; on the stack. Don't mention them to gcc, it will only be`
			`;; upset.`
			`subq 11*4,$sp`
			`movem $r10,[$sp]`

			`move.d $r11,$r0`
			`move.d $r11,$r1`
			`move.d $r11,$r2`
			`move.d $r11,$r3`
			`move.d $r11,$r4`
			`move.d $r11,$r5`
			`move.d $r11,$r6`
			`move.d $r11,$r7`
			`move.d $r11,$r8`
			`move.d $r11,$r9`
			`move.d $r11,$r10`

			`;; Now we've got this:`
			`;; r13 - dst`
			`;; r12 - n`

			`;; Update n for the first loop`
			`subq 12*4,$r12`
			`0:`
			`subq 12*4,$r12`
			`bge 0b`
			`movem $r11,[$r13+]`

			`addq 12*4,$r12 ;; compensate for last loop underflowing n`

			`;; Restore registers from stack`
			`movem [$sp+],$r10"`

			`/* Outputs */ : "=r" (dst), "=r" (n)`
			`/* Inputs */ : "0" (dst), "1" (n), "r" (lc));`

			`}`

			`/* Either we directly starts copying, using dword copying`
			`in a loop, or we copy as much as possible with 'movem'`
			`and then the last block (<44 bytes) is copied here.`
			`This will work since 'movem' will have updated src,dst,n. */`

			`while ( n >= 16 )`
			`{`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`n -= 16;`
			`}`

			`/* A switch() is definitely the fastest although it takes a LOT of code.`
			`* Particularly if you inline code this.`
			`*/`
			`switch (n)`
			`{`
			`case 0:`
			`break;`
			`case 1:`
			`(char)dst = (char) lc;`
			`break;`
			`case 2:`
			`(short)dst = (short) lc;`
			`break;`
			`case 3:`
			`((short)dst)++ = (short) lc;`
			`(char)dst = (char) lc;`
			`break;`
			`case 4:`
			`((long)dst)++ = lc;`
			`break;`
			`case 5:`
			`((long)dst)++ = lc;`
			`(char)dst = (char) lc;`
			`break;`
			`case 6:`
			`((long)dst)++ = lc;`
			`(short)dst = (short) lc;`
			`break;`
			`case 7:`
			`((long)dst)++ = lc;`
			`((short)dst)++ = (short) lc;`
			`(char)dst = (char) lc;`
			`break;`
			`case 8:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`break;`
			`case 9:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`(char)dst = (char) lc;`
			`break;`
			`case 10:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`(short)dst = (short) lc;`
			`break;`
			`case 11:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((short)dst)++ = (short) lc;`
			`(char)dst = (char) lc;`
			`break;`
			`case 12:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`break;`
			`case 13:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`(char)dst = (char) lc;`
			`break;`
			`case 14:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`(short)dst = (short) lc;`
			`break;`
			`case 15:`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((long)dst)++ = lc;`
			`((short)dst)++ = (short) lc;`
			`(char)dst = (char) lc;`
			`break;`
			`}`
			`}`

			`return return_dst; /* destination pointer. */`
			`} /* memset() */`