feat(newlib): riscv: add CONFIG_LIBC_OPTIMIZED_MISALIGNED_ACCESS config option

This option replaces implementations of functions from ROM:
  - memcpy
  - memcmp
  - memmove
  - str[n]cpy
  - str[n]cmp

The replacement functions linked into the firmware are better optimized for
misaligned memory accesses. Here are some measurements in CPU cycles for
4096-byte buffers:

  memcpy:  28676 -> 4128
  memcmp:  49147 -> 14259
  memmove: 33896 -> 8086
  strcpy:  32771 -> 17313
  strcmp:  32775 -> 13191
This commit is contained in:
Alexey Lapshin
2025-03-26 15:18:54 +07:00
parent f3625b0fb0
commit ec68cb3300
46 changed files with 1049 additions and 58 deletions
+62
View File
@@ -0,0 +1,62 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <_ansi.h>
#include <limits.h>
/*
Taken from glibc:
Add the compiler optimization to inhibit loop transformation to library
calls. This is used to avoid recursive calls in memset and memmove
default implementations.
*/
# define __inhibit_loop_to_libcall \
    __attribute__ ((__optimize__ ("-fno-tree-loop-distribute-patterns")))

/* Nonzero if X is not aligned on a "long" boundary.
 * This macro is used to skip a few bytes to find an aligned pointer.
 * It's better to keep it as is even if _HAVE_HW_MISALIGNED_ACCESS is enabled,
 * to avoid small performance penalties (if they are not zero).
 * The argument is parenthesized so that compound expressions are cast as a
 * whole before masking. */
#define UNALIGNED_X(X) ((long)(X) & (sizeof (long) - 1))

/* Nonzero when the compiler reports that misaligned accesses are handled
 * (fast or slow). Identifiers that are not defined evaluate to 0 in #if. */
#define _HAVE_HW_MISALIGNED_ACCESS (__riscv_misaligned_fast || __riscv_misaligned_slow)

#if _HAVE_HW_MISALIGNED_ACCESS
/* Hardware performs unaligned operations with little
 * to no penalty compared to byte-to-byte copy, so the word-wise paths are
 * taken regardless of pointer alignment. */
#define UNALIGNED_X_Y(X, Y) (0)
#else /* _HAVE_HW_MISALIGNED_ACCESS */
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
#define UNALIGNED_X_Y(X, Y) \
    (((long)(X) & (sizeof (long) - 1)) | ((long)(Y) & (sizeof (long) - 1)))
#endif /* _HAVE_HW_MISALIGNED_ACCESS */

/* How many bytes are copied each iteration of the word copy loop. */
#define LITTLE_BLOCK_SIZE (sizeof (long))

/* How many bytes are copied each iteration of the 4X unrolled loop. */
#define BIG_BLOCK_SIZE (sizeof (long) << 2)

/* Threshold for punting to the little block byte copier. */
#define TOO_SMALL_LITTLE_BLOCK(LEN) ((LEN) < LITTLE_BLOCK_SIZE)

/* Threshold for punting to the big block byte copier. */
#define TOO_SMALL_BIG_BLOCK(LEN) ((LEN) < BIG_BLOCK_SIZE)

/* Macros for detecting endchar.
 *
 * DETECT_NULL(X) is nonzero if the long-sized value X contains a zero byte.
 * The arithmetic is done in "unsigned long": callers pass both signed and
 * unsigned words, and subtracting from a signed value could overflow, which
 * is undefined behavior. X is evaluated twice, so it must be free of side
 * effects. */
#if LONG_MAX == 2147483647L
#define DETECT_NULL(X) \
    ((((unsigned long)(X)) - 0x01010101UL) & ~((unsigned long)(X)) & 0x80808080UL)
#else
#if LONG_MAX == 9223372036854775807L
#define DETECT_NULL(X) \
    ((((unsigned long)(X)) - 0x0101010101010101UL) & ~((unsigned long)(X)) & 0x8080808080808080UL)
#else
#error long int is not a 32bit or 64bit type.
#endif
#endif

/* Returns nonzero if (long)X contains the byte used to fill (long)MASK.
 * Both arguments are parenthesized so that an expression such as "a | b"
 * passed as MASK is XORed as a whole. */
#define DETECT_CHAR(X, MASK) (DETECT_NULL((X) ^ (MASK)))
+59
View File
@@ -0,0 +1,59 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include "local.h"
__attribute__((optimize("-Os")))
int
memcmp(const void *m1,
       const void *m2,
       size_t n)
{
    /*
     * Compare the first n bytes of m1 and m2.
     *
     * Returns 0 when all n bytes match; otherwise the difference between the
     * first pair of mismatching bytes, each taken as unsigned char
     * (m1's byte minus m2's byte).
     */
    const unsigned char *lhs = m1;
    const unsigned char *rhs = m2;

    /* Word-wise fast path: taken only when at least one full "long" remains
       and UNALIGNED_X_Y() reports no alignment obstacle. */
    if (!TOO_SMALL_LITTLE_BLOCK(n) && !UNALIGNED_X_Y(lhs, rhs)) {
        const unsigned long *lhs_word = (const unsigned long *)lhs;
        const unsigned long *rhs_word = (const unsigned long *)rhs;

        /* Skip over equal words. Stop at the first word that differs (the
           byte loop below locates the exact byte) or when fewer than
           LITTLE_BLOCK_SIZE bytes are left. */
        for (; !TOO_SMALL_LITTLE_BLOCK(n) && *lhs_word == *rhs_word;
                n -= LITTLE_BLOCK_SIZE) {
            lhs_word++;
            rhs_word++;
        }

        /* Continue byte by byte from where the word scan stopped. */
        lhs = (const unsigned char *)lhs_word;
        rhs = (const unsigned char *)rhs_word;
    }

    /* Byte-wise compare of whatever remains (or of everything, when the
       fast path was not taken). */
    for (; n != 0; n--, lhs++, rhs++) {
        if (*lhs != *rhs) {
            return *lhs - *rhs;
        }
    }

    return 0;
}
// Hook to force the linker to include this file.
// The function is intentionally empty: it only provides an external symbol
// that lives in the same object file as memcmp() above.
// NOTE(review): presumably something in the build references this symbol so
// that this memcmp() is linked instead of the ROM one - confirm in the
// component's build/linker configuration.
void esp_libc_include_memcmp_impl(void)
{
}
+88
View File
@@ -0,0 +1,88 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include <_ansi.h>
#include <stddef.h>
#include <limits.h>
#include "local.h"
__attribute__((optimize("-Os")))
void *
__inhibit_loop_to_libcall
memmove(void *dst_void,
        const void *src_void,
        size_t length)
{
    /*
     * Copy length bytes from src_void to dst_void; the regions may overlap.
     * Returns dst_void.
     *
     * The copy runs backwards only when the destination starts inside the
     * source region (src < dst < src + length); every other case is copied
     * forwards. Both directions use "long"-sized accesses when at least one
     * full word remains and UNALIGNED_X_Y() reports no alignment obstacle,
     * then finish with single bytes.
     */
    char *out = dst_void;
    const char *in = src_void;
    long *out_word;
    const long *in_word;

    if (!(in < out && out < in + length)) {
        /* No destructive overlap: forward copy. */
        if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(in, out)) {
            out_word = (long *)out;
            in_word = (const long *)in;

            /* Four words per iteration while a whole big block remains.
               The words are moved strictly in ascending order. */
            for (; !TOO_SMALL_BIG_BLOCK(length); length -= BIG_BLOCK_SIZE) {
                out_word[0] = in_word[0];
                out_word[1] = in_word[1];
                out_word[2] = in_word[2];
                out_word[3] = in_word[3];
                out_word += 4;
                in_word += 4;
            }

            /* Then one word per iteration. */
            for (; !TOO_SMALL_LITTLE_BLOCK(length); length -= LITTLE_BLOCK_SIZE) {
                *out_word = *in_word;
                out_word++;
                in_word++;
            }

            /* Hand the leftover bytes to the byte copier. */
            out = (char *)out_word;
            in = (const char *)in_word;
        }

        for (; length != 0; length--) {
            *out = *in;
            out++;
            in++;
        }

        return dst_void;
    }

    /* Destructive overlap: start one past the end of both regions and walk
       down, so source bytes are read before they get overwritten. */
    in += length;
    out += length;

    /* The alignment test is applied to the end pointers, because that is
       where the word accesses start in this direction. */
    if (!TOO_SMALL_LITTLE_BLOCK(length) && !UNALIGNED_X_Y(in, out)) {
        out_word = (long *)out;
        in_word = (const long *)in;

        /* One word per iteration, highest address first. */
        for (; !TOO_SMALL_LITTLE_BLOCK(length); length -= LITTLE_BLOCK_SIZE) {
            --out_word;
            --in_word;
            *out_word = *in_word;
        }

        /* Hand the leftover bytes to the byte copier. */
        out = (char *)out_word;
        in = (const char *)in_word;
    }

    for (; length != 0; length--) {
        --out;
        --in;
        *out = *in;
    }

    return dst_void;
}
// Hook to force the linker to include this file.
// The function is intentionally empty: it only provides an external symbol
// that lives in the same object file as memmove() above.
// NOTE(review): presumably something in the build references this symbol so
// that this memmove() is linked instead of the ROM one - confirm in the
// component's build/linker configuration.
void esp_libc_include_memmove_impl(void)
{
}
+63
View File
@@ -0,0 +1,63 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include <limits.h>
#include "local.h"
__attribute__((optimize("-Os")))
int
strncmp(const char *s1,
        const char *s2,
        size_t n)
{
    /*
     * Compare at most n characters of the strings s1 and s2.
     *
     * Returns 0 when the strings are equal up to the first terminator or up
     * to n characters, whichever comes first; otherwise the difference of
     * the first mismatching characters, each taken as unsigned char.
     */
    if (n == 0) {
        return 0;
    }

    /* Word-wise fast path, skipped when UNALIGNED_X_Y() reports an
       alignment obstacle. */
    if (!UNALIGNED_X_Y(s1, s2)) {
        const unsigned long *word1 = (const unsigned long *)s1;
        const unsigned long *word2 = (const unsigned long *)s2;

        /* Note: a whole word is loaded before it is checked for a
           terminator, so bytes after the end of a string that lie in the
           same word are read (but do not influence the result). */
        while (n >= sizeof(long)) {
            if (*word1 != *word2) {
                /* The words differ; the byte loop below finds where. */
                break;
            }

            n -= sizeof(long);

            /* The words are equal, so running out of characters or seeing
               a terminator inside this word means the strings match. */
            if (n == 0 || DETECT_NULL(*word1)) {
                return 0;
            }

            word1++;
            word2++;
        }

        /* Resume byte by byte from where the word scan stopped. */
        s1 = (const char *)word1;
        s2 = (const char *)word2;
    }

    for (; n > 0; s1++, s2++) {
        if (*s1 != *s2) {
            break;
        }

        n--;

        /* The characters are equal, so running out of characters or
           reaching the terminator means the strings match. */
        if (n == 0 || *s1 == '\0') {
            return 0;
        }
    }

    return *(const unsigned char *)s1 - *(const unsigned char *)s2;
}
// Hook to force the linker to include this file.
// The function is intentionally empty: it only provides an external symbol
// that lives in the same object file as strncmp() above.
// NOTE(review): presumably something in the build references this symbol so
// that this strncmp() is linked instead of the ROM one - confirm in the
// component's build/linker configuration.
void esp_libc_include_strncmp_impl(void)
{
}
+56
View File
@@ -0,0 +1,56 @@
/*
* SPDX-FileCopyrightText: 1994-2009 Red Hat, Inc.
*
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND Apache-2.0
*
* SPDX-FileContributor: 2025 Espressif Systems (Shanghai) CO LTD
*/
#include <string.h>
#include <limits.h>
#include "local.h"
__attribute__((optimize("-Os")))
char *
strncpy(char *__restrict dst0,
        const char *__restrict src0,
        size_t count)
{
    /*
     * Copy at most count characters from the string src0 into dst0.
     *
     * If src0 is shorter than count, the rest of dst0 is filled with '\0'
     * until exactly count characters have been written. If no terminator is
     * found within the first count characters of src0, dst0 is left without
     * one. Returns dst0.
     */
    char *out = dst0;
    const char *in = src0;

    /* Word-wise fast path: needs at least one full "long" to copy and no
       alignment obstacle reported by UNALIGNED_X_Y(). */
    if (!UNALIGNED_X_Y(in, out) && !TOO_SMALL_LITTLE_BLOCK(count)) {
        long *out_word = (long *)out;
        const long *in_word = (const long *)in;

        /* Copy whole words until one of them contains the terminator or
           fewer than a word's worth of characters may still be written.
           The word holding the terminator is left to the byte loop. */
        while (!TOO_SMALL_LITTLE_BLOCK(count)) {
            if (DETECT_NULL(*in_word)) {
                break;
            }
            *out_word++ = *in_word++;
            count -= sizeof(long int);
        }

        out = (char *)out_word;
        in = (const char *)in_word;
    }

    /* Copy single characters up to and including the terminator, or until
       count is used up. */
    while (count > 0) {
        char ch = *in++;

        *out++ = ch;
        --count;
        if (ch == '\0') {
            break;
        }
    }

    /* Fill the remainder of the destination with '\0'. */
    for (; count > 0; count--) {
        *out++ = '\0';
    }

    return dst0;
}
// Hook to force the linker to include this file.
// The function is intentionally empty: it only provides an external symbol
// that lives in the same object file as strncpy() above.
// NOTE(review): presumably something in the build references this symbol so
// that this strncpy() is linked instead of the ROM one - confirm in the
// component's build/linker configuration.
void esp_libc_include_strncpy_impl(void)
{
}