#ifndef MAD_MONO_AVX512_TC
#define MAD_MONO_AVX512_TC

/*
 o-----------------------------------------------------------------------------o
 |
 | AVX512 optimization for monomials
 |
 | Methodical Accelerator Design - Copyright (c) 2016+
 | Support: http://cern.ch/mad - mad at cern.ch
 | Authors: L. Deniau, laurent.deniau at cern.ch
 | Contrib: -
 |
 o-----------------------------------------------------------------------------o
 | You can redistribute this file and/or modify it under the terms of the GNU
 | General Public License GPLv3 (or later), as published by the Free Software
 | Foundation. This file is distributed in the hope that it will be useful, but
 | WITHOUT ANY WARRANTY OF ANY KIND. See http://gnu.org/licenses for details.
 o-----------------------------------------------------------------------------o
*/

/* Warning: this code has never been tested because I don't have access to
   AVX512 hardware yet. */

#include "mad_sse.h"

static inline void
dump (__m512i ra, __m512i rb, __mmask64 rr)
{
  printf("\nra=");
  for (int i=0; i < MAD_AVX512_CSIZ; ++i) printf("%02x ", ((ord_t*)&ra)[i]);
  printf("\nrb=");
  for (int i=0; i < MAD_AVX512_CSIZ; ++i) printf("%02x ", ((ord_t*)&rb)[i]);
  printf("\nrr=");
  // rr is a 64-bit lane mask, not a vector: print one bit per byte lane.
  for (int i=0; i < MAD_AVX512_CSIZ; ++i) printf("%d ", (int)((rr >> i) & 1));
  printf("\n");
}

int mad_mono_equ (ssz_t n, const ord_t a[n], const ord_t b[n])
{
  assert(a && b);
  int nn=MAD_AVX512_CRND(n), nm=MAD_AVX512_CMOD(n);
  __m512i ra, rb;

  for (int i=0; i < nn; i+=MAD_AVX512_CSIZ) {
    ra = _mm512_loadu_si512((__m512i*)&a[i]);
    rb = _mm512_loadu_si512((__m512i*)&b[i]);
    if (~_mm512_cmpeq_epu8_mask(ra,rb)) return 0;
  }

  if (nm) {
    __mmask64 mask = (1ull << nm) - 1;
    __m512i zero = _mm512_setzero_si512();
    ra = _mm512_mask_loadu_epi8(zero, mask, &a[nn]);
    rb = _mm512_mask_loadu_epi8(zero, mask, &b[nn]);
    if (~_mm512_cmpeq_epu8_mask(ra,rb)) return 0;
  }

  return 1;
}

int mad_mono_lt (ssz_t n, const ord_t a[n], const ord_t b[n])
{
  assert(a && b);
  int nn=MAD_AVX512_CRND(n), nm=MAD_AVX512_CMOD(n);
  __m512i ra, rb;

  for (int i=0; i < nn; i+=MAD_AVX512_CSIZ) {
    ra = _mm512_loadu_si512((__m512i*)&a[i]);
    rb = _mm512_loadu_si512((__m512i*)&b[i]);
    if (_mm512_cmpge_epu8_mask(ra,rb)) return 0;
  }

  if (nm) {
    __mmask64 mask = (1ull << nm) - 1;
    __m512i zero = _mm512_setzero_si512();
    ra = _mm512_mask_loadu_epi8(zero, mask, &a[nn]);
    rb = _mm512_mask_loadu_epi8(zero, mask, &b[nn]);
    // padded lanes compare 0 >= 0, so restrict the check to the valid lanes
    if (_mm512_cmpge_epu8_mask(ra,rb) & mask) return 0;
  }

  return 1;
}

int mad_mono_le (ssz_t n, const ord_t a[n], const ord_t b[n])
{
  assert(a && b);
  int nn=MAD_AVX512_CRND(n), nm=MAD_AVX512_CMOD(n);
  __m512i ra, rb;

  for (int i=0; i < nn; i+=MAD_AVX512_CSIZ) {
    ra = _mm512_loadu_si512((__m512i*)&a[i]);
    rb = _mm512_loadu_si512((__m512i*)&b[i]);
    if (_mm512_cmpgt_epu8_mask(ra,rb)) return 0;
  }

  if (nm) {
    __mmask64 mask = (1ull << nm) - 1;
    __m512i zero = _mm512_setzero_si512();
    ra = _mm512_mask_loadu_epi8(zero, mask, &a[nn]);
    rb = _mm512_mask_loadu_epi8(zero, mask, &b[nn]);
    if (_mm512_cmpgt_epu8_mask(ra,rb)) return 0;
  }

  return 1;
}

int mad_mono_gt (ssz_t n, const ord_t a[n], const ord_t b[n])
{
  return !mad_mono_le(n,a,b);
}

int mad_mono_ge (ssz_t n, const ord_t a[n], const ord_t b[n])
{
  return !mad_mono_lt(n,a,b);
}
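/* Illustrative smoke test (a sketch only, not part of the MAD build): shows
   how the comparison kernels above are expected to behave for a monomial
   length that is not a multiple of MAD_AVX512_CSIZ, so the masked-remainder
   path is exercised. The function name is hypothetical; the block is disabled
   and must be enabled by hand to try it on AVX512 hardware. */
#if 0
static void mad_mono_avx512_smoke_test (void)
{
  enum { N = 70 };                      // one full 64-byte chunk + 6 remainder bytes
  ord_t a[N], b[N];
  for (int i=0; i < N; ++i) a[i] = b[i] = (ord_t)(i % 7);
  b[N-1] = (ord_t)(a[N-1] + 1);         // b is larger in the last slot only

  printf("equ: %d (expect 0)\n", mad_mono_equ(N, a, b)); // not equal
  printf("le : %d (expect 1)\n", mad_mono_le (N, a, b)); // a[i] <= b[i] for all i
  printf("lt : %d (expect 0)\n", mad_mono_lt (N, a, b)); // not strictly less everywhere
  printf("gt : %d (expect 0)\n", mad_mono_gt (N, a, b)); // gt is !le
  printf("ge : %d (expect 1)\n", mad_mono_ge (N, a, b)); // ge is !lt
}
#endif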
int mad_mono_rcmp (ssz_t n, const ord_t a[n], const ord_t b[n])
{
  assert(a && b);
  int nn=MAD_AVX512_CRND(n), nm=MAD_AVX512_CMOD(n);
  __m512i ra, rb;
  __mmask64 rr;

  if (nm) {
    __mmask64 mask = (1ull << nm) - 1;
    __m512i zero = _mm512_setzero_si512();
    ra = _mm512_mask_loadu_epi8(zero, mask, &a[nn]);
    rb = _mm512_mask_loadu_epi8(zero, mask, &b[nn]);
    rr = _mm512_cmpeq_epu8_mask(ra,rb);
    if (~rr)
      for (int i=nm-1; i >= 0; i--)
        if (!(rr & (1ull<<i))) return ((ord_t*)&ra)[i]-((ord_t*)&rb)[i];
  }

  for (int i=nn-MAD_AVX512_CSIZ; i >= 0; i-=MAD_AVX512_CSIZ) {
    ra = _mm512_loadu_si512((__m512i*)&a[i]);
    rb = _mm512_loadu_si512((__m512i*)&b[i]);
    rr = _mm512_cmpeq_epu8_mask(ra,rb);
    if (~rr)
      for (int i=MAD_AVX512_CSIZ-2; i >= 0; i-=2) {
        if (!(rr & (1ull<<(i+1)))) return ((ord_t*)&ra)[i+1]-((ord_t*)&rb)[i+1];
        if (!(rr & (1ull<<(i+0)))) return ((ord_t*)&ra)[i+0]-((ord_t*)&rb)[i+0];
      }
  }

  return 0;
}

ord_t mad_mono_min (ssz_t n, const ord_t a[n])
{
  assert(a);
  int nn=MAD_AVX512_CRND(n), nm=MAD_AVX512_CMOD(n);
  __m512i ra, rr = _mm512_set1_epi8(-1);

  for (int i=0; i < nn; i+=MAD_AVX512_CSIZ) {
    ra = _mm512_loadu_si512((__m512i*)&a[i]);
    rr = _mm512_min_epu8(rr,ra);
  }

  if (nm) {
    __mmask64 mask = (1ull << nm) - 1;
    __m512i max = _mm512_set1_epi8(-1);
    ra = _mm512_mask_loadu_epi8(max, mask, &a[nn]);
    rr = _mm512_min_epu8(rr,ra);
  }

  int ni = nn ? MAD_AVX512_CSIZ : nm;
  ord_t m0 = -1, m1 = -1;
  for (int i=0; i < ni; i+=2) {
    if (m0 > ((ord_t*)&rr)[i+0]) m0 = ((ord_t*)&rr)[i+0];
    if (m1 > ((ord_t*)&rr)[i+1]) m1 = ((ord_t*)&rr)[i+1];
  }
  return m0 <= m1 ? m0 : m1;
}

ord_t mad_mono_max (ssz_t n, const ord_t a[n])
{
  assert(a);
  int nn=MAD_AVX512_CRND(n), nm=MAD_AVX512_CMOD(n);
  __m512i ra, rr = _mm512_setzero_si512();

  for (int i=0; i < nn; i+=MAD_AVX512_CSIZ) {
    ra = _mm512_loadu_si512((__m512i*)&a[i]);
    rr = _mm512_max_epu8(rr,ra);
  }

  if (nm) {
    __mmask64 mask = (1ull << nm) - 1;
    __m512i zero = _mm512_setzero_si512();
    ra = _mm512_mask_loadu_epi8(zero, mask, &a[nn]);
    rr = _mm512_max_epu8(rr,ra);
  }

  int ni = nn ? MAD_AVX512_CSIZ : nm;
  ord_t m0 = 0, m1 = 0;
  for (int i=0; i < ni; i+=2) {
    if (m0 < ((ord_t*)&rr)[i+0]) m0 = ((ord_t*)&rr)[i+0];
    if (m1 < ((ord_t*)&rr)[i+1]) m1 = ((ord_t*)&rr)[i+1];
  }
  return m0 >= m1 ? m0 : m1;
}

// --- end --------------------------------------------------------------------o

#endif // MAD_MONO_AVX512_TC