Skip to content

Commit 936546d

Browse files
committed
Optimize pg_comp_crc32c_sse42 routine slightly, and also use it on x86.
Eliminate the separate 'len' variable from the loops, and also use the 4-byte instruction. This shaves off a few more cycles. Even though this routine, which uses the special SSE 4.2 instructions, is much faster than a generic routine, it's still a hot spot, so let's make it as fast as possible. Change the configure test to not test _mm_crc32_u64. That variant is only available in the 64-bit x86-64 architecture, not in 32-bit x86. Modify pg_comp_crc32c_sse42 so that it only uses _mm_crc32_u64 on x86-64. With these changes, the SSE-accelerated CRC-32C implementation can also be used on 32-bit x86 systems. This also fixes the 32-bit MSVC build.
1 parent b73e7a0 commit 936546d

File tree

3 files changed

+41
-22
lines changed

3 files changed

+41
-22
lines changed

config/c-compiler.m4

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -476,20 +476,24 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
476476

477477
# PGAC_SSE42_CRC32_INTRINSICS
478478
# -----------------------
479-
# Check if the compiler supports _mm_crc32_u8 and _mm_crc32_u64 intrinsics.
479+
# Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
480+
# using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. (We don't
481+
# test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if
482+
# the other ones are, on x86-64 platforms)
483+
#
480484
# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
481485
# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42.
482486
AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
483487
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
484-
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=$1], [Ac_cachevar],
488+
AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
485489
[pgac_save_CFLAGS=$CFLAGS
486490
CFLAGS="$pgac_save_CFLAGS $1"
487491
ac_save_c_werror_flag=$ac_c_werror_flag
488492
ac_c_werror_flag=yes
489493
AC_TRY_LINK([#include <nmmintrin.h>],
490494
[unsigned int crc = 0;
491495
crc = _mm_crc32_u8(crc, 0);
492-
crc = (unsigned int) _mm_crc32_u64(crc, 0);],
496+
crc = _mm_crc32_u32(crc, 0);],
493497
[Ac_cachevar=yes],
494498
[Ac_cachevar=no])
495499
ac_c_werror_flag=$ac_save_c_werror_flag

configure

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14172,8 +14172,8 @@ fi
1417214172
# First check if the _mm_crc32_u8 and _mm_crc32_u32 intrinsics can be used
1417314173
# with the default compiler flags. If not, check if adding the -msse4.2
1417414174
# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
14175-
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=" >&5
14176-
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=... " >&6; }
14175+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
14176+
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
1417714177
if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
1417814178
$as_echo_n "(cached) " >&6
1417914179
else
@@ -14189,7 +14189,7 @@ main ()
1418914189
{
1419014190
unsigned int crc = 0;
1419114191
crc = _mm_crc32_u8(crc, 0);
14192-
crc = (unsigned int) _mm_crc32_u64(crc, 0);
14192+
crc = _mm_crc32_u32(crc, 0);
1419314193
;
1419414194
return 0;
1419514195
}
@@ -14212,8 +14212,8 @@ if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
1421214212
fi
1421314213

1421414214
if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
14215-
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2" >&5
14216-
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u64 with CFLAGS=-msse4.2... " >&6; }
14215+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
14216+
$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
1421714217
if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
1421814218
$as_echo_n "(cached) " >&6
1421914219
else
@@ -14229,7 +14229,7 @@ main ()
1422914229
{
1423014230
unsigned int crc = 0;
1423114231
crc = _mm_crc32_u8(crc, 0);
14232-
crc = (unsigned int) _mm_crc32_u64(crc, 0);
14232+
crc = _mm_crc32_u32(crc, 0);
1423314233
;
1423414234
return 0;
1423514235
}

src/port/pg_crc32c_sse42.c

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,30 +22,45 @@ pg_crc32c
2222
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
2323
{
2424
const unsigned char *p = data;
25-
const uint64 *p8;
25+
const unsigned char *pend = p + len;
2626

2727
/*
2828
* Process eight bytes of data at a time.
2929
*
30-
* NB: We do unaligned 8-byte accesses here. The Intel architecture
31-
* allows that, and performance testing didn't show any performance
32-
* gain from aligning the beginning address.
30+
* NB: We do unaligned accesses here. The Intel architecture allows that,
31+
* and performance testing didn't show any performance gain from aligning
32+
* the beginning address.
3333
*/
34-
p8 = (const uint64 *) p;
35-
while (len >= 8)
34+
#ifdef __x86_64__
35+
while (p + 8 <= pend)
3636
{
37-
crc = (uint32) _mm_crc32_u64(crc, *p8++);
38-
len -= 8;
37+
crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
38+
p += 8;
3939
}
4040

41+
/* Process remaining full four bytes if any */
42+
if (p + 4 <= pend)
43+
{
44+
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
45+
p += 4;
46+
}
47+
#else
4148
/*
42-
* Handle any remaining bytes one at a time.
49+
* Process four bytes at a time. (The eight byte instruction is not
50+
* available on the 32-bit x86 architecture).
4351
*/
44-
p = (const unsigned char *) p8;
45-
while (len > 0)
52+
while (p + 4 <= pend)
53+
{
54+
crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
55+
p += 4;
56+
}
57+
#endif /* __x86_64__ */
58+
59+
/* Process any remaining bytes one at a time. */
60+
while (p < pend)
4661
{
47-
crc = _mm_crc32_u8(crc, *p++);
48-
len--;
62+
crc = _mm_crc32_u8(crc, *p);
63+
p++;
4964
}
5065

5166
return crc;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy