diff --git a/core/klibc.c b/core/klibc.c index 1a928e2..e5c9537 100644 --- a/core/klibc.c +++ b/core/klibc.c @@ -16,14 +16,66 @@ int memcmp(const void *aptr, const void *bptr, size_t size) return 0; } -void *memcpy(void *dst, const void *src, size_t n) +// Inspired by https://interrupt.memfault.com/blog/memcpy-newlib-nano +/* Nonzero if either X or Y is not aligned on a "long" boundary. */ +#define UNALIGNED(X, Y) \ + (((long)X & (sizeof (long) - 1)) | ((long)Y & (sizeof (long) - 1))) + +/* How many bytes are copied each iteration of the 4X unrolled loop. */ +#define BIGBLOCKSIZE (sizeof (long) << 2) + +/* How many bytes are copied each iteration of the word copy loop. */ +#define LITTLEBLOCKSIZE (sizeof (long)) + +/* Threshold for punting to the byte copier. */ +#define TOO_SMALL(LEN) ((LEN) < BIGBLOCKSIZE) + +void *memcpy(void *dst0, const void *src0, size_t len0) { - char *dstChar = dst; - const char *srcChar = src; - for (size_t i = 0; i < n; i++) { +#if 0 + char *dstChar = dst0; + const char *srcChar = src0; + for (size_t i = 0; i < len0; i++) { *(dstChar++) = *(srcChar++); } - return dst; + return dst0; +#else + char *dst = dst0; + const char *src = src0; + long *aligned_dst; + const long *aligned_src; + + /* If the size is small, or either SRC or DST is unaligned, + then punt into the byte copy loop. This should be rare. */ + if (!TOO_SMALL(len0) && !UNALIGNED(src, dst)) { + aligned_dst = (long *)dst; + aligned_src = (long *)src; + + /* Copy 4X long words at a time if possible. */ + while (len0 >= BIGBLOCKSIZE) { + *aligned_dst++ = *aligned_src++; + *aligned_dst++ = *aligned_src++; + *aligned_dst++ = *aligned_src++; + *aligned_dst++ = *aligned_src++; + len0 -= BIGBLOCKSIZE; + } + + /* Copy one long word at a time if possible. */ + while (len0 >= LITTLEBLOCKSIZE) { + *aligned_dst++ = *aligned_src++; + len0 -= LITTLEBLOCKSIZE; + } + + /* Pick up any residual with a byte copier. 
*/ + dst = (char *)aligned_dst; + src = (char *)aligned_src; + } + + while (len0--) + *dst++ = *src++; + + return dst0; +#endif } void *memset(void *src, int c, size_t n) diff --git a/core/time.c b/core/time.c index 4bd7f42..b08b77d 100644 --- a/core/time.c +++ b/core/time.c @@ -27,3 +27,10 @@ unsigned long usecs_to_jiffies(const unsigned int u) // This could overflow return (u * HZ) / 1000000L; } + +#include +inline uint64_t read_cycle_counter() +{ + uint64_t tsc = __rdtsc(); + return tsc; +} diff --git a/core/time.h b/core/time.h index 699b33d..a5a4bdd 100644 --- a/core/time.h +++ b/core/time.h @@ -1,4 +1,5 @@ #pragma once +#include #define HZ 100 /* @@ -50,3 +51,5 @@ unsigned int jiffies_to_msecs(const unsigned long j); unsigned int jiffies_to_usecs(const unsigned long j); unsigned long msecs_to_jiffies(const unsigned int m); unsigned long usecs_to_jiffies(const unsigned int u); + +uint64_t read_cycle_counter(); diff --git a/tests/test.c b/tests/test.c index 2a87674..b1a11c1 100644 --- a/tests/test.c +++ b/tests/test.c @@ -11,6 +11,29 @@ #include "synchro.h" #include "time.h" +void testMemcpyPerf() +{ + struct test_struct { + char data[4096]; + }; + // instantiate 2 structs. for our purposes, we don't care what data is in + // there. 
set them to `volatile` so the compiler won't optimize away what we + // do with them + volatile struct test_struct dest, source; + + printf("Test Memcpy perf\n"); + // run through powers-of-two memcpy's, printing stats for each test + for (size_t len = 1; len <= sizeof(dest); len <<= 1) { + uint32_t start = read_cycle_counter(); // << Start count (NOTE(review): uint64_t result narrowed to uint32_t) + memcpy((void *)&dest, (void *)&source, len); + uint32_t stop = read_cycle_counter(); // << Stop count + + // print out the cycles consumed + printf("len = %d, %d %d cyccnt = %d, cycles/byte = %d\n", (uint32_t)len, stop, start, + stop - start, (stop - start) / len); + } +} + void testPhymem(void) { printf("Testing memory PHY\n"); @@ -47,7 +70,8 @@ void testPhymem(void) assert(freePageStatFree == freePageStatBegin); assert(usedPageStatFree == usedPageStatBegin); - assertmsg((page = (struct phyMemDesc *)allocPhyPage(1)) != NULL, "Cannot allocate memory\n"); + assertmsg((page = (struct phyMemDesc *)allocPhyPage(1)) != NULL, + "Cannot allocate memory\n"); unrefPhyPage((ulong)page); } @@ -137,7 +161,8 @@ static void testPaging(void) } printf("%d pages freed\n", freeCount); - assertmsg((page = (struct phyMemDesc *)allocPhyPage(1)) != NULL, "Cannot allocate memory\n"); + assertmsg((page = (struct phyMemDesc *)allocPhyPage(1)) != NULL, + "Cannot allocate memory\n"); unrefPhyPage((ulong)page); } @@ -315,6 +340,7 @@ void testKthread() void run_test(void) { + testMemcpyPerf(); { int test = 1000; long long int test64 = 0x100000000;