結果
問題 | No.135 とりあえず1次元の問題 |
ユーザー | ななり |
提出日時 | 2024-09-10 10:39:58 |
言語 | C++23(gcc13) (gcc 13.2.0 + boost 1.83.0) |
結果 |
RE
|
実行時間 | - |
コード長 | 25,704 bytes |
コンパイル時間 | 3,177 ms |
コンパイル使用メモリ | 265,536 KB |
実行使用メモリ | 6,948 KB |
最終ジャッジ日時 | 2024-09-10 10:40:06 |
合計ジャッジ時間 | 6,868 ms |
ジャッジサーバーID (参考情報) |
judge1 / judge3 |
(要ログイン)
テストケース
テストケース表示入力 | 結果 | 実行時間 実行使用メモリ |
---|---|---|
testcase_00 | RE | - |
testcase_01 | RE | - |
testcase_02 | RE | - |
testcase_03 | RE | - |
testcase_04 | RE | - |
testcase_05 | RE | - |
testcase_06 | RE | - |
testcase_07 | RE | - |
testcase_08 | RE | - |
testcase_09 | RE | - |
testcase_10 | RE | - |
testcase_11 | RE | - |
testcase_12 | RE | - |
testcase_13 | RE | - |
testcase_14 | RE | - |
testcase_15 | RE | - |
testcase_16 | RE | - |
testcase_17 | RE | - |
testcase_18 | RE | - |
testcase_19 | RE | - |
testcase_20 | RE | - |
testcase_21 | RE | - |
testcase_22 | RE | - |
evil01.txt | RE | - |
ソースコード
#pragma GCC diagnostic ignored "-Warray-bounds" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #pragma GCC diagnostic ignored "-Wunused-result" #pragma GCC diagnostic ignored "-Wuninitialized" #pragma GCC optimize "Ofast" #pragma GCC target "avx512dq,avx512f" #include <algorithm> #include <cstdio> #include <cstdint> #include <immintrin.h> #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d #define NETWORK_32BIT_1 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 #define NETWORK_32BIT_2 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 #define NETWORK_32BIT_3 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 struct avx512_32bit_swizzle_ops { template <typename vtype, int scale> static inline __m512i swap_n(__m512i reg) { if constexpr (scale == 2) return _mm512_shuffle_epi32(reg, (_MM_PERM_ENUM)0b10110001); else if constexpr (scale == 4) return _mm512_shuffle_epi32(reg, (_MM_PERM_ENUM)0b01001110); else if constexpr (scale == 8) return _mm512_shuffle_i64x2(reg, reg, 0b10110001); else if constexpr (scale == 16) return _mm512_shuffle_i64x2(reg, reg, 0b01001110); } template <typename vtype, int scale> static inline __m512i reverse_n(__m512i reg) { if constexpr (scale == 2) return swap_n<vtype, 2>(reg); else if constexpr (scale == 4) return _mm512_permutexvar_epi32(_mm512_set_epi32(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3), reg); else if constexpr (scale == 8) return _mm512_permutexvar_epi32(_mm512_set_epi32(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7), reg); else if constexpr (scale == 16) return _mm512_permutexvar_epi32(_mm512_set_epi32(NETWORK_32BIT_2), reg); } template <typename vtype, int scale> static inline __m512i merge_n(__m512i reg, __m512i other) { if constexpr (scale == 2) return _mm512_mask_blend_epi32(0b0101010101010101, reg, other); else if constexpr (scale == 4) return _mm512_mask_blend_epi32(0b0011001100110011, reg, other); else if constexpr (scale == 8) return _mm512_mask_blend_epi32(0b0000111100001111, reg, other); else if constexpr (scale == 16) return _mm512_mask_blend_epi32(0b0000000011111111, reg, other); } }; template <typename type> struct zmm_vector; template <typename vtype> static inline void COEX(__m512i &a, __m512i &b) { __m512i temp = a; a = _mm512_min_epi32(a, b); b = _mm512_max_epi32(temp, b); } template <typename vtype> static inline int32_t get_pivot(int32_t *arr, const int32_t left, const int32_t right) { int32_t samples[16], delta = (right - left) >> 4; for (int i = 0; i < 16; ++i) samples[i] = arr[left + i * delta]; __m512i rand_vec = _mm512_loadu_si512(samples), sort = vtype::sort_vec(rand_vec); return ((int32_t*)&sort)[8]; } template <typename vtype> static inline int32_t get_pivot_blocks(int32_t *arr, const int32_t left, const int32_t right) { if (right - left <= 1024) return get_pivot<vtype>(arr, left, right); int32_t delta = (right - 16 - left) / 5; __m512i vecs[5]; for (int i = 0; i < 5; ++i) vecs[i] = _mm512_loadu_si512(arr + left + delta * i); COEX<vtype>(vecs[0], vecs[3]); COEX<vtype>(vecs[1], vecs[4]); COEX<vtype>(vecs[0], vecs[2]); COEX<vtype>(vecs[1], vecs[3]); COEX<vtype>(vecs[0], vecs[1]); COEX<vtype>(vecs[2], vecs[4]); COEX<vtype>(vecs[1], vecs[2]); COEX<vtype>(vecs[3], vecs[4]); COEX<vtype>(vecs[2], vecs[3]); __m512i &vec = vecs[2]; vec = vtype::sort_vec(vec); int32_t data[16]; _mm512_storeu_si512(data, vec); return data[8]; } template <typename vtype, int numVecs> static inline void bitonic_sort_n_vec(__m512i *regs) { if constexpr (numVecs == 1) return; else if constexpr (numVecs == 2) COEX<vtype>(regs[0], regs[1]); else if constexpr (numVecs == 4) { COEX<vtype>(regs[0], regs[2]); COEX<vtype>(regs[1], regs[3]); COEX<vtype>(regs[0], regs[1]); COEX<vtype>(regs[2], regs[3]); COEX<vtype>(regs[1], regs[2]); } else if constexpr (numVecs == 8) { COEX<vtype>(regs[0], regs[2]); COEX<vtype>(regs[1], regs[3]); COEX<vtype>(regs[4], regs[6]); COEX<vtype>(regs[5], regs[7]); COEX<vtype>(regs[0], regs[4]); COEX<vtype>(regs[1], regs[5]); COEX<vtype>(regs[2], regs[6]); COEX<vtype>(regs[3], regs[7]); COEX<vtype>(regs[0], regs[1]); COEX<vtype>(regs[2], regs[3]); COEX<vtype>(regs[4], regs[5]); COEX<vtype>(regs[6], regs[7]); COEX<vtype>(regs[2], regs[4]); COEX<vtype>(regs[3], regs[5]); COEX<vtype>(regs[1], regs[4]); COEX<vtype>(regs[3], regs[6]); COEX<vtype>(regs[1], regs[2]); COEX<vtype>(regs[3], regs[4]); COEX<vtype>(regs[5], regs[6]); } else if constexpr (numVecs == 16) { COEX<vtype>(regs[0], regs[13]); COEX<vtype>(regs[1], regs[12]); COEX<vtype>(regs[2], regs[15]); COEX<vtype>(regs[3], regs[14]); COEX<vtype>(regs[4], regs[8]); COEX<vtype>(regs[5], regs[6]); COEX<vtype>(regs[7], regs[11]); COEX<vtype>(regs[9], regs[10]); COEX<vtype>(regs[0], regs[5]); COEX<vtype>(regs[1], regs[7]); COEX<vtype>(regs[2], regs[9]); COEX<vtype>(regs[3], regs[4]); COEX<vtype>(regs[6], regs[13]); COEX<vtype>(regs[8], regs[14]); COEX<vtype>(regs[10], regs[15]); COEX<vtype>(regs[11], regs[12]); COEX<vtype>(regs[0], regs[1]); COEX<vtype>(regs[2], regs[3]); COEX<vtype>(regs[4], regs[5]); COEX<vtype>(regs[6], regs[8]); COEX<vtype>(regs[7], regs[9]); COEX<vtype>(regs[10], regs[11]); COEX<vtype>(regs[12], regs[13]); COEX<vtype>(regs[14], regs[15]); COEX<vtype>(regs[0], regs[2]); COEX<vtype>(regs[1], regs[3]); COEX<vtype>(regs[4], regs[10]); COEX<vtype>(regs[5], regs[11]); COEX<vtype>(regs[6], regs[7]); COEX<vtype>(regs[8], regs[9]); COEX<vtype>(regs[12], regs[14]); COEX<vtype>(regs[13], regs[15]); COEX<vtype>(regs[1], regs[2]); COEX<vtype>(regs[3], regs[12]); COEX<vtype>(regs[4], regs[6]); COEX<vtype>(regs[5], regs[7]); COEX<vtype>(regs[8], regs[10]); COEX<vtype>(regs[9], regs[11]); COEX<vtype>(regs[13], regs[14]); COEX<vtype>(regs[1], regs[4]); COEX<vtype>(regs[2], regs[6]); COEX<vtype>(regs[5], regs[8]); COEX<vtype>(regs[7], regs[10]); COEX<vtype>(regs[9], regs[13]); COEX<vtype>(regs[11], regs[14]); COEX<vtype>(regs[2], regs[4]); COEX<vtype>(regs[3], regs[6]); COEX<vtype>(regs[9], regs[12]); COEX<vtype>(regs[11], regs[13]); COEX<vtype>(regs[3], regs[5]); COEX<vtype>(regs[6], regs[8]); COEX<vtype>(regs[7], regs[9]); COEX<vtype>(regs[10], regs[12]); COEX<vtype>(regs[3], regs[4]); COEX<vtype>(regs[5], regs[6]); COEX<vtype>(regs[7], regs[8]); COEX<vtype>(regs[9], regs[10]); COEX<vtype>(regs[11], regs[12]); COEX<vtype>(regs[6], regs[7]); COEX<vtype>(regs[8], regs[9]); } else if constexpr (numVecs == 32) { COEX<vtype>(regs[0], regs[1]); COEX<vtype>(regs[2], regs[3]); COEX<vtype>(regs[4], regs[5]); COEX<vtype>(regs[6], regs[7]); COEX<vtype>(regs[8], regs[9]); COEX<vtype>(regs[10], regs[11]); COEX<vtype>(regs[12], regs[13]); COEX<vtype>(regs[14], regs[15]); COEX<vtype>(regs[16], regs[17]); COEX<vtype>(regs[18], regs[19]); COEX<vtype>(regs[20], regs[21]); COEX<vtype>(regs[22], regs[23]); COEX<vtype>(regs[24], regs[25]); COEX<vtype>(regs[26], regs[27]); COEX<vtype>(regs[28], regs[29]); COEX<vtype>(regs[30], regs[31]); COEX<vtype>(regs[0], regs[2]); COEX<vtype>(regs[1], regs[3]); COEX<vtype>(regs[4], regs[6]); COEX<vtype>(regs[5], regs[7]); COEX<vtype>(regs[8], regs[10]); COEX<vtype>(regs[9], regs[11]); COEX<vtype>(regs[12], regs[14]); COEX<vtype>(regs[13], regs[15]); COEX<vtype>(regs[16], regs[18]); COEX<vtype>(regs[17], regs[19]); COEX<vtype>(regs[20], regs[22]); COEX<vtype>(regs[21], regs[23]); COEX<vtype>(regs[24], regs[26]); COEX<vtype>(regs[25], regs[27]); COEX<vtype>(regs[28], regs[30]); COEX<vtype>(regs[29], regs[31]); COEX<vtype>(regs[0], regs[4]); COEX<vtype>(regs[1], regs[5]); COEX<vtype>(regs[2], regs[6]); COEX<vtype>(regs[3], regs[7]); COEX<vtype>(regs[8], regs[12]); COEX<vtype>(regs[9], regs[13]); COEX<vtype>(regs[10], regs[14]); COEX<vtype>(regs[11], regs[15]); COEX<vtype>(regs[16], regs[20]); COEX<vtype>(regs[17], regs[21]); COEX<vtype>(regs[18], regs[22]); COEX<vtype>(regs[19], regs[23]); COEX<vtype>(regs[24], regs[28]); COEX<vtype>(regs[25], regs[29]); COEX<vtype>(regs[26], regs[30]); COEX<vtype>(regs[27], regs[31]); COEX<vtype>(regs[0], regs[8]); COEX<vtype>(regs[1], regs[9]); COEX<vtype>(regs[2], regs[10]); COEX<vtype>(regs[3], regs[11]); COEX<vtype>(regs[4], regs[12]); COEX<vtype>(regs[5], regs[13]); COEX<vtype>(regs[6], regs[14]); COEX<vtype>(regs[7], regs[15]); COEX<vtype>(regs[16], regs[24]); COEX<vtype>(regs[17], regs[25]); COEX<vtype>(regs[18], regs[26]); COEX<vtype>(regs[19], regs[27]); COEX<vtype>(regs[20], regs[28]); COEX<vtype>(regs[21], regs[29]); COEX<vtype>(regs[22], regs[30]); COEX<vtype>(regs[23], regs[31]); COEX<vtype>(regs[0], regs[16]); COEX<vtype>(regs[1], regs[8]); COEX<vtype>(regs[2], regs[4]); COEX<vtype>(regs[3], regs[12]); COEX<vtype>(regs[5], regs[10]); COEX<vtype>(regs[6], regs[9]); COEX<vtype>(regs[7], regs[14]); COEX<vtype>(regs[11], regs[13]); COEX<vtype>(regs[15], regs[31]); COEX<vtype>(regs[17], regs[24]); COEX<vtype>(regs[18], regs[20]); COEX<vtype>(regs[19], regs[28]); COEX<vtype>(regs[21], regs[26]); COEX<vtype>(regs[22], regs[25]); COEX<vtype>(regs[23], regs[30]); COEX<vtype>(regs[27], regs[29]); COEX<vtype>(regs[1], regs[2]); COEX<vtype>(regs[3], regs[5]); COEX<vtype>(regs[4], regs[8]); COEX<vtype>(regs[6], regs[22]); COEX<vtype>(regs[7], regs[11]); COEX<vtype>(regs[9], regs[25]); COEX<vtype>(regs[10], regs[12]); COEX<vtype>(regs[13], regs[14]); COEX<vtype>(regs[17], regs[18]); COEX<vtype>(regs[19], regs[21]); COEX<vtype>(regs[20], regs[24]); COEX<vtype>(regs[23], regs[27]); COEX<vtype>(regs[26], regs[28]); COEX<vtype>(regs[29], regs[30]); COEX<vtype>(regs[1], regs[17]); COEX<vtype>(regs[2], regs[18]); COEX<vtype>(regs[3], regs[19]); COEX<vtype>(regs[4], regs[20]); COEX<vtype>(regs[5], regs[10]); COEX<vtype>(regs[7], regs[23]); COEX<vtype>(regs[8], regs[24]); COEX<vtype>(regs[11], regs[27]); COEX<vtype>(regs[12], regs[28]); COEX<vtype>(regs[13], regs[29]); COEX<vtype>(regs[14], regs[30]); COEX<vtype>(regs[21], regs[26]); COEX<vtype>(regs[3], regs[17]); COEX<vtype>(regs[4], regs[16]); COEX<vtype>(regs[5], regs[21]); COEX<vtype>(regs[6], regs[18]); COEX<vtype>(regs[7], regs[9]); COEX<vtype>(regs[8], regs[20]); COEX<vtype>(regs[10], regs[26]); COEX<vtype>(regs[11], regs[23]); COEX<vtype>(regs[13], regs[25]); COEX<vtype>(regs[14], regs[28]); COEX<vtype>(regs[15], regs[27]); COEX<vtype>(regs[22], regs[24]); COEX<vtype>(regs[1], regs[4]); COEX<vtype>(regs[3], regs[8]); COEX<vtype>(regs[5], regs[16]); COEX<vtype>(regs[7], regs[17]); COEX<vtype>(regs[9], regs[21]); COEX<vtype>(regs[10], regs[22]); COEX<vtype>(regs[11], regs[19]); COEX<vtype>(regs[12], regs[20]); COEX<vtype>(regs[14], regs[24]); COEX<vtype>(regs[15], regs[26]); COEX<vtype>(regs[23], regs[28]); COEX<vtype>(regs[27], regs[30]); COEX<vtype>(regs[2], regs[5]); COEX<vtype>(regs[7], regs[8]); COEX<vtype>(regs[9], regs[18]); COEX<vtype>(regs[11], regs[17]); COEX<vtype>(regs[12], regs[16]); COEX<vtype>(regs[13], regs[22]); COEX<vtype>(regs[14], regs[20]); COEX<vtype>(regs[15], regs[19]); COEX<vtype>(regs[23], regs[24]); COEX<vtype>(regs[26], regs[29]); COEX<vtype>(regs[2], regs[4]); COEX<vtype>(regs[6], regs[12]); COEX<vtype>(regs[9], regs[16]); COEX<vtype>(regs[10], regs[11]); COEX<vtype>(regs[13], regs[17]); COEX<vtype>(regs[14], regs[18]); COEX<vtype>(regs[15], regs[22]); COEX<vtype>(regs[19], regs[25]); COEX<vtype>(regs[20], regs[21]); COEX<vtype>(regs[27], regs[29]); COEX<vtype>(regs[5], regs[6]); COEX<vtype>(regs[8], regs[12]); COEX<vtype>(regs[9], regs[10]); COEX<vtype>(regs[11], regs[13]); COEX<vtype>(regs[14], regs[16]); COEX<vtype>(regs[15], regs[17]); COEX<vtype>(regs[18], regs[20]); COEX<vtype>(regs[19], regs[23]); COEX<vtype>(regs[21], regs[22]); COEX<vtype>(regs[25], regs[26]); COEX<vtype>(regs[3], regs[5]); COEX<vtype>(regs[6], regs[7]); COEX<vtype>(regs[8], regs[9]); COEX<vtype>(regs[10], regs[12]); COEX<vtype>(regs[11], regs[14]); COEX<vtype>(regs[13], regs[16]); COEX<vtype>(regs[15], regs[18]); COEX<vtype>(regs[17], regs[20]); COEX<vtype>(regs[19], regs[21]); COEX<vtype>(regs[22], regs[23]); COEX<vtype>(regs[24], regs[25]); COEX<vtype>(regs[26], regs[28]); COEX<vtype>(regs[3], regs[4]); COEX<vtype>(regs[5], regs[6]); COEX<vtype>(regs[7], regs[8]); COEX<vtype>(regs[9], regs[10]); COEX<vtype>(regs[11], regs[12]); COEX<vtype>(regs[13], regs[14]); COEX<vtype>(regs[15], regs[16]); COEX<vtype>(regs[17], regs[18]); COEX<vtype>(regs[19], regs[20]); COEX<vtype>(regs[21], regs[22]); COEX<vtype>(regs[23], regs[24]); COEX<vtype>(regs[25], regs[26]); COEX<vtype>(regs[27], regs[28]); } } template <typename vtype, int numVecs, int scale, bool first = true> static inline void internal_merge_n_vec(__m512i *reg) { using swizzle = typename vtype::swizzle_ops; if constexpr (scale <= 1) return; else { if constexpr (first) { for (int i = 0; i < numVecs; ++i) { __m512i &v = reg[i]; __m512i rev = swizzle::template reverse_n<vtype, scale>(v); COEX<vtype>(rev, v); v = swizzle::template merge_n<vtype, scale>(v, rev); } } else { for (int i = 0; i < numVecs; ++i) { __m512i &v = reg[i]; __m512i swap = swizzle::template swap_n<vtype, scale>(v); COEX<vtype>(swap, v); v = swizzle::template merge_n<vtype, scale>(v, swap); } } internal_merge_n_vec<vtype, numVecs, (scale >> 1), false>(reg); } } template <typename vtype, int numVecs, int scale> static inline void merge_substep_n_vec(__m512i *regs) { if constexpr (numVecs <= 1) return; using swizzle = typename vtype::swizzle_ops; constexpr int h = numVecs >> 1; for (int i = h; i < numVecs; ++i) regs[i] = swizzle::template reverse_n<vtype, scale>(regs[i]); for (int i = 0; i < h; ++i) COEX<vtype>(regs[i], regs[numVecs - 1 - i]); merge_substep_n_vec<vtype, h, scale>(regs); merge_substep_n_vec<vtype, h, scale>(regs + h); } template <typename vtype, int numVecs, int scale> static inline void merge_step_n_vec(__m512i *regs) { merge_substep_n_vec<vtype, numVecs, scale>(regs); internal_merge_n_vec<vtype, numVecs, scale>(regs); } template <typename vtype, int numVecs, int numPer = 2> static inline void merge_n_vec(__m512i *regs) { if constexpr (numPer > 16) return; else { merge_step_n_vec<vtype, numVecs, numPer>(regs); merge_n_vec<vtype, numVecs, numPer + numPer>(regs); } } template <typename vtype> static inline bool comparison_func(const int32_t &a, const int32_t &b) { return a < b; } template <typename vtype> int avx512_double_compressstore(int32_t* left_addr, int32_t* right_addr, __mmask16 k, __m512i reg) { int amount_ge_pivot = _mm_popcnt_u32((int)k); _mm512_mask_compressstoreu_epi32(left_addr, _mm512_knot(k), reg); _mm512_mask_compressstoreu_epi32(right_addr + 16 - amount_ge_pivot, k, reg); return amount_ge_pivot; } template <typename vtype> static inline int32_t partition_vec(int32_t *l_store, int32_t *r_store, const __m512i curr_vec, const __m512i pivot_vec, __m512i &smallest_vec, __m512i &biggest_vec) { __mmask16 ge_mask = _mm512_cmp_epi32_mask(curr_vec, pivot_vec, _MM_CMPINT_NLT); int amount_ge_pivot = avx512_double_compressstore<zmm_vector<int32_t>>(l_store, r_store, ge_mask, curr_vec); smallest_vec = _mm512_min_epi32(curr_vec, smallest_vec); biggest_vec = _mm512_max_epi32(curr_vec, biggest_vec); return amount_ge_pivot; } template <typename vtype> static inline int32_t partition_avx512(int32_t *arr, int32_t left, int32_t right, int32_t pivot, int32_t *smallest, int32_t *biggest) { for (int32_t i = (right - left) & 15; i > 0; --i) { *smallest = std::min(*smallest, arr[left], comparison_func<vtype>); *biggest = std::max(*biggest, arr[left], comparison_func<vtype>); if (!comparison_func<vtype>(arr[left], pivot)) std::swap(arr[left], arr[--right]); else ++left; } if (left == right) return left; __m512i pivot_vec = _mm512_set1_epi32(pivot), min_vec = _mm512_set1_epi32(*smallest), max_vec = _mm512_set1_epi32(*biggest); if (right - left == 16) { __m512i vec = _mm512_loadu_si512(arr + left); int32_t unpartitioned = right - left - 16, l_store = left, amount_ge_pivot = partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, vec, pivot_vec, min_vec, max_vec); *smallest = _mm512_reduce_min_epi32(min_vec); *biggest = _mm512_reduce_max_epi32(max_vec); return l_store + 16 - amount_ge_pivot; } __m512i vec_left = _mm512_loadu_si512(arr + left), vec_right = _mm512_loadu_si512(arr + right - 16); int32_t unpartitioned = right - left - 16, l_store = left; left += 16; right -= 16; while (right - left != 0) { __m512i curr_vec; if (l_store + unpartitioned + 16 - right < left - l_store) { right -= 16; curr_vec = _mm512_loadu_si512(arr + right); } else { curr_vec = _mm512_loadu_si512(arr + left); left += 16; } l_store += 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, curr_vec, pivot_vec, min_vec, max_vec); unpartitioned -= 16; } l_store += 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, vec_left, pivot_vec, min_vec, max_vec); *smallest = _mm512_reduce_min_epi32(min_vec); *biggest = _mm512_reduce_max_epi32(max_vec); return l_store + 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned - 16, vec_right, pivot_vec, min_vec, max_vec); } template <typename vtype> static inline int32_t partition_avx512_unrolled(int32_t* arr, int32_t left, int32_t right, int32_t pivot, int32_t* smallest, int32_t* biggest) { if (right - left < 384) return partition_avx512<vtype>(arr, left, right, pivot, smallest, biggest); for (int32_t i = ((right - left) & 15); i > 0; --i) { *smallest = std::min(*smallest, arr[left], comparison_func<vtype>); *biggest = std::max(*biggest, arr[left], comparison_func<vtype>); if (!comparison_func<vtype>(arr[left], pivot)) std::swap(arr[left], arr[--right]); else ++left; } int32_t unpartitioned = right - left - 16, l_store = left; __m512i pivot_vec = _mm512_set1_epi32(pivot), min_vec = _mm512_set1_epi32(*smallest), max_vec = _mm512_set1_epi32(*biggest), vec_align[8]; int vecsToPartition = (right - left) >> 4 & 7; for (int i = 0; i < vecsToPartition; ++i) vec_align[i] = _mm512_loadu_si512(arr + left + (i << 4)); left += (vecsToPartition << 4); __m512i vec_left[8], vec_right[8]; for (int ii = 0; ii < 8; ++ii) { vec_left[ii] = _mm512_loadu_si512(arr + left + (ii << 4)); vec_right[ii] = _mm512_loadu_si512(arr + right - ((8 - ii) << 4)); } left += 128; right -= 128; while (left != right) { __m512i curr_vec[8]; if (l_store + unpartitioned + 16 - right < left - l_store) { right -= 128; for (int ii = 0; ii < 8; ++ii) curr_vec[ii] = _mm512_loadu_si512(arr + right + (ii << 4)); } else { for (int ii = 0; ii < 8; ++ii) curr_vec[ii] = _mm512_loadu_si512(arr + left + (ii << 4)); left += 128; } for (int ii = 0; ii < 8; ++ii) { l_store += 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, curr_vec[ii], pivot_vec, min_vec, max_vec); unpartitioned -= 16; } } for (int ii = 0; ii < 8; ++ii) { l_store += 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, vec_left[ii], pivot_vec, min_vec, max_vec); unpartitioned -= 16; } for (int ii = 0; ii < 8; ++ii) { l_store += 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, vec_right[ii], pivot_vec, min_vec, max_vec); unpartitioned -= 16; } for (int ii = 0; ii < vecsToPartition; ++ii) { l_store += 16 - partition_vec<vtype>(arr + l_store, arr + l_store + unpartitioned, vec_align[ii], pivot_vec, min_vec, max_vec); unpartitioned -= 16; } *smallest = _mm512_reduce_min_epi32(min_vec); *biggest = _mm512_reduce_max_epi32(max_vec); return l_store; } template <typename vtype, int numVecs> static inline void sort_n_vec(int32_t* arr, int N) { constexpr int h = numVecs >> 1; if constexpr (numVecs > 1) { if (N <= (numVecs << 3)) { sort_n_vec<vtype, h>(arr, N); return; } } __m512i vecs[numVecs]; __mmask16 ioMasks[h]; for (int i = h, j = 0; i < numVecs; ++i, ++j) ioMasks[j] = (1 << std::min(std::max(0, N - (i << 4)), 16)) - 1; for (int i = 0; i < h; ++i) vecs[i] = _mm512_loadu_si512(arr + (i << 4)); for (int i = h, j = 0; i < numVecs; ++i, ++j) vecs[i] = _mm512_mask_loadu_epi32(_mm512_set1_epi32(INT32_MAX), ioMasks[j], arr + (i << 4)); bitonic_sort_n_vec<vtype, numVecs>(vecs); merge_n_vec<vtype, numVecs>(vecs); for (int i = 0; i < h; ++i) _mm512_storeu_si512(arr + (i << 4), vecs[i]); for (int i = h, j = 0; i < numVecs; ++i, ++j) _mm512_mask_storeu_epi32(arr + (i << 4), ioMasks[j], vecs[i]); } template <typename vtype> static void qsort_(int32_t* arr, int32_t left, int32_t right, int32_t max_iters) { if (max_iters <= 0) { std::sort(arr + left, arr + right + 1, comparison_func<vtype>); return; } if (right - left <= 511) { sort_n_vec<vtype, 32>(arr + left, right + 1 - left); return; } int32_t pivot = get_pivot_blocks<vtype>(arr, left, right), smallest = INT32_MAX, biggest = INT32_MIN, pivot_index = partition_avx512_unrolled<vtype>(arr, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) qsort_<vtype>(arr, left, pivot_index - 1, max_iters - 1); if (pivot != biggest) qsort_<vtype>(arr, pivot_index, right, max_iters - 1); } template <typename vtype> static inline __m512i cmp_merge(__m512i in1, __m512i in2, __mmask16 mask) { return _mm512_mask_mov_epi32(_mm512_min_epi32(in2, in1), mask, _mm512_max_epi32(in2, in1)); } template <typename vtype> static inline __m512i sort_zmm_32bit(__m512i zmm) { zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA); zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(0, 1, 2, 3)>(zmm), 0xCCCC); zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA); zmm = cmp_merge<vtype>(zmm, _mm512_permutexvar_epi32(_mm512_set_epi32(NETWORK_32BIT_1), zmm), 0xF0F0); zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC); zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA); zmm = cmp_merge<vtype>(zmm, _mm512_permutexvar_epi32(_mm512_set_epi32(NETWORK_32BIT_2), zmm), 0xFF00); zmm = cmp_merge<vtype>(zmm, _mm512_permutexvar_epi32(_mm512_set_epi32(NETWORK_32BIT_3), zmm), 0xF0F0); zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm), 0xCCCC); zmm = cmp_merge<vtype>(zmm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm), 0xAAAA); return zmm; } template <> struct zmm_vector<int32_t> { using swizzle_ops = avx512_32bit_swizzle_ops; template <uint8_t mask> static __m512i shuffle(__m512i zmm) { return _mm512_shuffle_epi32(zmm, (_MM_PERM_ENUM)mask); } static __m512i sort_vec(__m512i x) { return sort_zmm_32bit<zmm_vector<int32_t>>(x); } }; uint8_t c; int64_t n = 0, z = 1000001; int main() { fseek(stdin, 0, 2); int64_t istrlen = ftell(stdin); uint8_t istr[istrlen], *iptr = istr; fseek(stdin, 0, 0); fread(istr, 1, istrlen, stdin); while ((c = *iptr++) >= '0') n = n * 10 + (c - '0'); int32_t X[n]; for (int64_t i = 0; i < n; ++i) { int32_t x = *(int32_t*)iptr; if ((x | 0x30303030) == x) { int64_t _ = *(int16_t*)(iptr + 4); if ((_ | 0x3030) == _) { if (*(iptr + 6) == '0') { x = 1000000; iptr += 8; } else { x = (((((x & 0xF0F0F0F) * 0xA01) >> 8 & 0xFF00FF) * 0x640001) >> 16) * 100 + (_ & 0xF) * 10 + (_ >> 8 & 0xF); iptr += 7; } } else if ((_ | 0x30) == _) { x = (((((x & 0xF0F0F0F) * 0xA01) >> 8 & 0xFF00FF) * 0x640001) >> 16) * 10 + (_ & 0xF); iptr += 6; } else { x = ((((x & 0xF0F0F0F) * 0xA01) >> 8 & 0xFF00FF) * 0x640001) >> 16; iptr += 5; } } else if ((x | 0x3030) == x) { uint8_t _ = *(iptr + 2); if (_ >= '0') { x = (x & 0xF) * 100 + (x >> 8 & 0xF) * 10 + (_ - '0'); iptr += 4; } else { x = (x & 0xF) * 10 + (x >> 8 & 0xF); iptr += 3; } } else { x &= 0xF; iptr += 2; } X[i] = x; } qsort_<zmm_vector<int32_t>>(X, 0, n - 1, (31 - __builtin_clz(n)) << 1); int64_t p = X[0]; for (int64_t i = 1; i < n; ++i) { int64_t xi = X[i]; if (p != xi) { if (xi - p < z) z = xi - p; p = xi; } } printf("%ld\n", z); return 0; }