// NOTE(review): in the previous text of this region every token written
// between angle brackets (include targets, template parameter lists,
// container element types, cast target types) was missing, so it could not
// compile. They are restored here from how each name is used. The original
// header names are not recoverable; the headers below cover every standard
// name the visible code relies on.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

using namespace std;

const int maxn = 1e6 + 5;
// Table with 20 levels per index; it is not touched by the functions in this
// region.
int sp[maxn][20];

// Builds the suffix array of the first n elements of s by prefix doubling.
//
// Parameters:
//   n          - number of elements of s to use.
//   s          - indexable sequence (e.g. std::string or std::vector<int>).
//   char_bound - when != -1, the first pass is a counting sort over
//                [0, char_bound) and every s[i] is used directly as an index,
//                so each s[i] must satisfy 0 <= s[i] < char_bound. When -1,
//                the first pass is a comparison sort instead.
// Returns: a vector of n start positions ordered by the suffixes they begin
//          (empty when n == 0).
template <typename T>
vector<int> suffix_array(int n, const T &s, int char_bound) {
  vector<int> a(n);
  if (n == 0) {
    return a;
  }
  if (char_bound != -1) {
    // Counting sort of positions by their first element.
    vector<int> aux(char_bound, 0);
    for (int i = 0; i < n; i++) {
      aux[s[i]]++;
    }
    int sum = 0;
    for (int i = 0; i < char_bound; i++) {
      int add = aux[i];
      aux[i] = sum;  // aux[c] becomes the first output slot for value c
      sum += add;
    }
    for (int i = 0; i < n; i++) {
      a[aux[s[i]]++] = i;
    }
  } else {
    iota(a.begin(), a.end(), 0);
    sort(a.begin(), a.end(), [&s](int i, int j) { return s[i] < s[j]; });
  }
  vector<int> sorted_by_second(n);
  vector<int> ptr_group(n);
  vector<int> new_group(n);
  vector<int> group(n);
  // group[i] = rank of suffix i when only its first `step` elements count.
  group[a[0]] = 0;
  for (int i = 1; i < n; i++) {
    group[a[i]] = group[a[i - 1]] + (!(s[a[i]] == s[a[i - 1]]));
  }
  int cnt = group[a[n - 1]] + 1;
  int step = 1;
  while (cnt < n) {  // stop as soon as every suffix has its own group
    // Order positions by their second half: suffixes shorter than `step`
    // have an empty second half and therefore come first.
    int at = 0;
    for (int i = n - step; i < n; i++) {
      sorted_by_second[at++] = i;
    }
    for (int i = 0; i < n; i++) {
      if (a[i] - step >= 0) {
        sorted_by_second[at++] = a[i] - step;
      }
    }
    // ptr_group[g] = first slot in `a` currently occupied by group g.
    for (int i = n - 1; i >= 0; i--) {
      ptr_group[group[a[i]]] = i;
    }
    // Stable redistribution by first-half group (a radix pass).
    for (int i = 0; i < n; i++) {
      int x = sorted_by_second[i];
      a[ptr_group[group[x]]++] = x;
    }
    new_group[a[0]] = 0;
    for (int i = 1; i < n; i++) {
      if (group[a[i]] != group[a[i - 1]]) {
        new_group[a[i]] = new_group[a[i - 1]] + 1;
      } else {
        // Same first half: compare second halves, -1 meaning "past the end".
        int pre = (a[i - 1] + step >= n ? -1 : group[a[i - 1] + step]);
        int cur = (a[i] + step >= n ? -1 : group[a[i] + step]);
        new_group[a[i]] = new_group[a[i - 1]] + (pre != cur);
      }
    }
    swap(group, new_group);
    cnt = group[a[n - 1]] + 1;
    step <<= 1;
  }
  return a;
}

// Convenience overload: suffix array of the whole sequence s.
template <typename T>
vector<int> suffix_array(const T &s, int char_bound) {
  return suffix_array((int) s.size(), s, char_bound);
}

// Builds the LCP array for the first n elements of s given its suffix array.
//
// Returns a vector of max(n - 1, 0) entries where entry r is the length of
// the longest common prefix of the suffixes starting at sa[r] and sa[r + 1].
// Asserts that sa has exactly n entries.
template <typename T>
vector<int> build_lcp(int n, const T &s, const vector<int> &sa) {
  assert((int) sa.size() == n);
  vector<int> pos(n);  // pos[i] = rank of suffix i inside sa
  for (int i = 0; i < n; i++) {
    pos[sa[i]] = i;
  }
  vector<int> lcp(max(n - 1, 0));
  int k = 0;
  for (int i = 0; i < n; i++) {
    // The match length carried over from suffix i - 1 shrinks by at most one.
    k = max(k - 1, 0);
    if (pos[i] == n - 1) {
      k = 0;  // last suffix in sorted order has no successor
    } else {
      int j = sa[pos[i] + 1];
      while (i + k < n && j + k < n && s[i + k] == s[j + k]) {
        k++;
      }
      lcp[pos[i]] = k;
    }
  }
  return lcp;
}

// Convenience overload: LCP array of the whole sequence s.
template <typename T>
vector<int> build_lcp(const T &s, const vector<int> &sa) {
  return build_lcp((int) s.size(), s, sa);
}

typedef long long ll;
const int p = 998244353;

// Returns a^b modulo p by recursive square-and-multiply (b >= 0).
// Note that po(a, 1) returns a itself, without reducing it modulo p.
int po(int a, int b) {
  if (b == 0) return 1;
  if (b == 1) return a;
  if (b % 2 == 0) {
    int u = po(a, b / 2);
    return (u * 1LL * u) % p;
  } else {
    int u = po(a, b - 1);
    return (a * 1LL * u) % p;
  }
}

// Returns x^(p-2) modulo p, i.e. the modular inverse of x when p does not
// divide x (Fermat's little theorem; p = 998244353).
int inv(int x) { return po(x, p - 2); }

#pragma GCC target("avx2")
#pragma GCC optimize("O3")
#pragma GCC optimize("unroll-loops")
// NOTE(review): the six header names that followed the pragmas were lost;
// these are the ones needed for the names the SIMD / NTT / IO code uses.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr std::uint32_t MOD = 998244353, PRIMITIVE_ROOT = 3;
const unsigned int MAX_N = 524288;

// Returns a * b modulo MOD; the product is formed in 64 bits so it cannot
// overflow.
std::uint32_t mod_mul(std::uint32_t a, std::uint32_t b) {
  return static_cast<std::uint64_t>(a) * b % MOD;
}

// Returns a^b modulo MOD by iterative binary exponentiation.
// Declared at global scope, so it is a separate overload from std::pow.
std::uint32_t pow(std::uint32_t a, std::uint32_t b) {
  std::uint32_t ret = 1;
  while (b) {
    if (b & 1) ret = mod_mul(ret, a);
    a = mod_mul(a, a);
    b >>= 1;
  }
  return ret;
}

// Lane-wise (a + b) with one conditional subtraction of MOD: a lane keeps the
// plain sum when sum - MOD is negative as a signed 32-bit value, and takes
// sum - MOD otherwise. Presumably expects both inputs already in [0, MOD).
__m256i mod_add_simd(__m256i a, __m256i b) {
  __m256i c = _mm256_add_epi32(a, b);
  __m256i d = _mm256_sub_epi32(c, _mm256_set1_epi32(MOD));
  __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(0), d);
  return _mm256_blendv_epi8(d, c, mask);
}

// Lane-wise (a - b) with one conditional addition of MOD: a lane takes
// diff + MOD when the difference is negative as a signed 32-bit value, and
// the plain difference otherwise. Presumably expects both inputs already in
// [0, MOD).
__m256i mod_sub_simd(__m256i a, __m256i b) {
  __m256i c = _mm256_sub_epi32(a, b);
  __m256i d = _mm256_add_epi32(c, _mm256_set1_epi32(MOD));
  __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(0), c);
  return _mm256_blendv_epi8(c, d, mask);
}
// AVX2 number-theoretic transform helpers, a buffered integer reader/writer,
// and the program entry point.
//
// NOTE(review): this text has lost every token that was written between angle
// brackets. Visible casualties in this region: the target type of
// `static_cast(...)` / `reinterpret_cast(buf)`, the element type of
// `std::vector w_vec`, and large parts of `main`, which also ends inside an
// unterminated `/*` comment. The code is left untouched; the notes below
// describe only what the visible statements do.
//
// reduce_simd(a)   - per 32-bit lane: a - MOD, except that a lane keeps a when
//                    a - MOD is negative as a signed value.
// add_simd(a, b)   - plain lane-wise a + b, no reduction.
// sub_simd(a, b)   - lane-wise a - b + MOD, no reduction.
// mulhi_simd(a, b) - duplicates lanes with unpacklo/unpackhi, multiplies them
//                    as unsigned 64-bit products (_mm256_mul_epu32) and
//                    gathers the high 32 bits of each product with
//                    _mm256_shuffle_ps, giving the high half of a[i] * b[i]
//                    in the original lane order.
// ConstMulSimd     - multiplication by fixed per-lane constants. Stores the
//                    constant `a` and `a_div = (a << 32) / MOD` for each lane.
//                    mul(b) computes q = mulhi(a_div, b) and returns the low
//                    32 bits of a * b - q * MOD; mul_mod(b) additionally
//                    applies reduce_simd. Presumably mul() yields a value in
//                    [0, 2 * MOD) for reduced inputs - TODO confirm.
// ntt(n, a)        - in-place forward transform using
//                    w = PRIMITIVE_ROOT^((MOD - 1) / n). Stages run with block
//                    size m = n, n/2, ... while m >= 16; each butterfly stores
//                    x0 + x1 and (x0 - x1) times a power of w, and w is
//                    squared after every stage. A final loop handles the last
//                    three stages inside one 8-lane register using lane
//                    permutes and blends. All loads and stores are the aligned
//                    _mm256_load_si256 / _mm256_store_si256 forms, so `a` must
//                    be 32-byte aligned. n is presumably a power of two and at
//                    least 8 - TODO confirm with callers.
// intt(n, a)       - in-place inverse transform using
//                    w = PRIMITIVE_ROOT^(MOD - 1 - (MOD - 1) / n). It records
//                    the successive squares of w in w_vec, first runs the
//                    three in-register stages, then the stages with
//                    m = 16, 32, ..., n (taking roots from the back of w_vec),
//                    and finally multiplies every element by
//                    pow(n, MOD - 2).
// convolution(n, a, b) - ntt on both arrays, element-wise mod_mul into a, then
//                    intt on a; b is left in transformed form.
// IO               - owns a char buffer of 10 * 2 * MAX_N + 14 bytes.
//   char_arr_to_int  folds eight packed digit bytes into one integer using
//                    multiply-and-shift steps (x10, x100, x10000).
//   int_to_char_arr  splits a value into eight packed digit bytes using
//                    multiply-and-shift steps.
//   read_int         loads eight bytes from buf; the `space` mask flags bytes
//                    whose value plus 0x10 lacks bit 0x40, which excludes
//                    ASCII digits. With no flagged byte it converts eight
//                    digits and then inspects buf[8] for a ninth digit;
//                    otherwise it converts only the bytes before the first
//                    flagged one. buf is advanced past the number and one
//                    following byte. Presumably relies on little-endian byte
//                    order - TODO confirm.
//   read             one fread of stdin into the buffer, then n, m, n values
//                    into a and m values into b.
//   write_int        values >= 100000000 are written as one leading digit
//                    character plus eight digits (so the value is presumably
//                    below 10^9); smaller values are written without leading
//                    zeros. A single space follows every number.
//   write            formats n values into the buffer and emits them with one
//                    fwrite to stdout.
// v1, v2           - 32-byte-aligned static arrays of MAX_N * 2 elements.
// main             - NOTE(review): truncated. The surviving fragments read a
//                    test count, then n, m and two strings per test, build a
//                    suffix array and LCP array, compare candidate positions
//                    with a `cmp` lambda, and assemble `ans` from s1 with s2
//                    spliced in at the chosen position. The missing pieces
//                    cannot be reconstructed from this text.
__m256i reduce_simd(__m256i a) { __m256i b = _mm256_sub_epi32(a, _mm256_set1_epi32(MOD)); __m256i mask = _mm256_cmpgt_epi32(_mm256_set1_epi32(0), b); return _mm256_blendv_epi8(b, a, mask); } __m256i add_simd(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); } __m256i sub_simd(__m256i a, __m256i b) { __m256i c = _mm256_sub_epi32(a, b); return _mm256_add_epi32(c, _mm256_set1_epi32(MOD)); } __m256i mulhi_simd(__m256i a, __m256i b) { __m256i a0 = _mm256_unpacklo_epi32(a, a); __m256i a1 = _mm256_unpackhi_epi32(a, a); __m256i b0 = _mm256_unpacklo_epi32(b, b); __m256i b1 = _mm256_unpackhi_epi32(b, b); __m256i c0 = _mm256_mul_epu32(a0, b0); __m256i c1 = _mm256_mul_epu32(a1, b1); return (__m256i)_mm256_shuffle_ps((__m256)c0, (__m256)c1, _MM_SHUFFLE(3, 1, 3, 1)); } struct ConstMulSimd { __m256i a; __m256i a_div; explicit ConstMulSimd(std::uint32_t a) : a(_mm256_set1_epi32(a)) { a_div = _mm256_set1_epi32((std::uint64_t(a) << 32) / MOD); } ConstMulSimd(std::uint32_t a0, std::uint32_t a1, std::uint32_t a2, std::uint32_t a3, std::uint32_t a4, std::uint32_t a5, std::uint32_t a6, std::uint32_t a7) { a = _mm256_setr_epi32(a0, a1, a2, a3, a4, a5, a6, a7); std::uint32_t a_div0, a_div1, a_div2, a_div3, a_div4, a_div5, a_div6, a_div7; a_div0 = (std::uint64_t(a0) << 32) / MOD; a_div1 = (std::uint64_t(a1) << 32) / MOD; a_div2 = (std::uint64_t(a2) << 32) / MOD; a_div3 = (std::uint64_t(a3) << 32) / MOD; a_div4 = (std::uint64_t(a4) << 32) / MOD; a_div5 = (std::uint64_t(a5) << 32) / MOD; a_div6 = (std::uint64_t(a6) << 32) / MOD; a_div7 = (std::uint64_t(a7) << 32) / MOD; a_div = _mm256_setr_epi32(a_div0, a_div1, a_div2, a_div3, a_div4, a_div5, a_div6, a_div7); } __m256i mul(__m256i b) { __m256i q = mulhi_simd(a_div, b); __m256i c = _mm256_mullo_epi32(a, b); __m256i qm = _mm256_mullo_epi32(q, _mm256_set1_epi32(MOD)); __m256i r = _mm256_sub_epi32(c, qm); return r; } __m256i mul_mod(__m256i b) { __m256i r = mul(b); return reduce_simd(r); } }; void ntt(unsigned int n, std::uint32_t *a) { 
// ntt body: wide stages (m >= 16) first, then the last three stages in-register; intt begins at the end of this line.
std::uint32_t w = pow(PRIMITIVE_ROOT, (MOD - 1) / n); unsigned int m = n; while (m >= 16) { unsigned int mh = m / 2; std::uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8; w0 = 1; w1 = w; w2 = mod_mul(w1, w1); w3 = mod_mul(w2, w1); w4 = mod_mul(w2, w2); w5 = mod_mul(w4, w1); w6 = mod_mul(w4, w2); w7 = mod_mul(w4, w3); w8 = mod_mul(w4, w4); ConstMulSimd wi_mul(w0, w1, w2, w3, w4, w5, w6, w7); for (unsigned int i = 0; i < n; i += m) { std::uint32_t wj = 1; for (unsigned int j = 0; j < mh; j += 8) { unsigned int j0 = i + j, j1 = j0 + mh; __m256i x0 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + j0)); __m256i x1 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + j1)); __m256i y0 = mod_add_simd(x0, x1); __m256i y1 = wi_mul.mul(sub_simd(x0, x1)); ConstMulSimd wj_mul(wj); y1 = wj_mul.mul_mod(y1); _mm256_store_si256(reinterpret_cast<__m256i *>(a + j0), y0); _mm256_store_si256(reinterpret_cast<__m256i *>(a + j1), y1); wj = mod_mul(wj, w8); } } m = mh; w = w2; } std::uint32_t w0, w1, w2, w3; w0 = 1; w1 = w; w2 = mod_mul(w1, w1); w3 = mod_mul(w2, w1); ConstMulSimd mul_1(1, 1, 1, 1, w0, w1, w2, w3), mul_2(1, 1, w0, w2, 1, 1, w0, w2); for (unsigned int i = 0; i < n; i += 8) { __m256i x00, x01, x10, x11, x20, x21, x30, x31, x40, x41, x50, x51, x6; x00 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + i)); x01 = _mm256_permute4x64_epi64(x00, 0b01001110); x10 = add_simd(x00, x01); x11 = sub_simd(x01, x00); x20 = mul_1.mul_mod(_mm256_blend_epi32(x10, x11, 0b11110000)); x21 = _mm256_shuffle_epi32(x20, 0b01001110); x30 = add_simd(x20, x21); x31 = sub_simd(x21, x20); x40 = mul_2.mul_mod(_mm256_blend_epi32(x30, x31, 0b11001100)); x41 = _mm256_shuffle_epi32(x40, 0b10110001); x50 = add_simd(x40, x41); x51 = sub_simd(x41, x40); x6 = reduce_simd(_mm256_blend_epi32(x50, x51, 0b10101010)); _mm256_store_si256(reinterpret_cast<__m256i *>(a + i), x6); } } void intt(unsigned int n, std::uint32_t *a) { std::vector w_vec; std::uint32_t w = pow(PRIMITIVE_ROOT, MOD - 1 - (MOD - 1) / n); for 
// intt continued: collect root squares, run the in-register stages, then the wide stages from m = 16 up to n.
(unsigned int i = n; i >= 16; i >>= 1) { w_vec.push_back(w); w = mod_mul(w, w); } std::uint32_t w0, w1, w2, w3; w0 = 1; w1 = w; w2 = mod_mul(w1, w1); w3 = mod_mul(w2, w1); ConstMulSimd mul_1(1, 1, 1, 1, w0, w1, w2, w3), mul_2(1, 1, w0, w2, 1, 1, w0, w2); for (unsigned int i = 0; i < n; i += 8) { __m256i x00, x01, x10, x11, x20, x21, x30, x31, x40, x41, x50, x51, x6; x00 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + i)); x01 = _mm256_shuffle_epi32(x00, 0b10110001); x10 = add_simd(x00, x01); x11 = sub_simd(x01, x00); x20 = mul_2.mul_mod(_mm256_blend_epi32(x10, x11, 0b10101010)); x21 = _mm256_shuffle_epi32(x20, 0b01001110); x30 = add_simd(x20, x21); x31 = sub_simd(x21, x20); x40 = mul_1.mul_mod(_mm256_blend_epi32(x30, x31, 0b11001100)); x41 = _mm256_permute4x64_epi64(x40, 0b01001110); x50 = add_simd(x40, x41); x51 = sub_simd(x41, x40); x6 = reduce_simd(_mm256_blend_epi32(x50, x51, 0b11110000)); _mm256_store_si256(reinterpret_cast<__m256i *>(a + i), x6); } unsigned int m = 16; while (m <= n) { unsigned int mh = m / 2; std::uint32_t w = w_vec.back(); w_vec.pop_back(); std::uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8; w0 = 1; w1 = w; w2 = mod_mul(w1, w1); w3 = mod_mul(w2, w1); w4 = mod_mul(w2, w2); w5 = mod_mul(w4, w1); w6 = mod_mul(w4, w2); w7 = mod_mul(w4, w3); w8 = mod_mul(w4, w4); ConstMulSimd wi_mul(w0, w1, w2, w3, w4, w5, w6, w7); for (unsigned int i = 0; i < n; i += m) { std::uint32_t wj = 1; for (unsigned int j = 0; j < mh; j += 8) { unsigned int j0 = i + j, j1 = j0 + mh; __m256i x0 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + j0)); __m256i x1 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + j1)); x1 = wi_mul.mul_mod(x1); ConstMulSimd wj_mul(wj); x1 = wj_mul.mul_mod(x1); __m256i y0 = mod_add_simd(x0, x1); __m256i y1 = mod_sub_simd(x0, x1); _mm256_store_si256(reinterpret_cast<__m256i *>(a + j0), y0); _mm256_store_si256(reinterpret_cast<__m256i *>(a + j1), y1); wj = mod_mul(wj, w8); } } m *= 2; } std::uint32_t n_inv = pow(n, MOD - 2); ConstMulSimd 
// intt tail (scale by n^-1), then convolution and the first half of struct IO.
mul_n_inv(n_inv); for (unsigned int i = 0; i < n; i += 8) { __m256i x0 = _mm256_load_si256(reinterpret_cast<__m256i *>(a + i)); _mm256_store_si256(reinterpret_cast<__m256i *>(a + i), mul_n_inv.mul_mod(x0)); } } void convolution(unsigned int n, std::uint32_t *a, std::uint32_t *b) { ntt(n, a); ntt(n, b); for (unsigned int i = 0; i < n; i++) { a[i] = mod_mul(a[i], b[i]); } intt(n, a); } struct IO { const static unsigned int MAX_STDIO_SIZE = 10 * 2 * MAX_N + 14; char stdio_buf[MAX_STDIO_SIZE]; std::uint32_t char_arr_to_int(std::uint64_t x) { x = (x >> 8) + x * 10; x = ((x >> 16) & 0x000000FF000000FFull) + (x & 0x000000FF000000FFull) * 100; x = (x >> 32) + x * 10000; return x & 0xFFFFFFFFull; } std::uint64_t int_to_char_arr(std::uint32_t x) { std::uint64_t r = x, q; q = ((r * 109951163) & 0xFFFFFF0000000000ull) >> 40; r = (r << 32) - q * 42949672959999ull; q = ((r * 10486) & 0xFFF00000FFF00000ull) >> 20; r = (r << 16) - q * 6553599ull; q = ((r * 103) & 0xFC00FC00FC00FC00ull) >> 10; r = (r << 8) - q * 2559ull; return r; } std::uint32_t read_int(char *&buf) { std::uint64_t x = *reinterpret_cast(buf); std::uint64_t space = ((x + 0x1010101010101010ull) & 0x4040404040404040) ^ 0x4040404040404040; if (space == 0) { std::uint32_t res = char_arr_to_int(x - 0x3030303030303030ull); if (buf[8] <= ' ') { buf += 9; } else { res = res * 10 + (buf[8] - '0'); buf += 10; } return res; } unsigned int space_pos = __builtin_ffsll(space); std::uint64_t mask = (1ULL << (space_pos - 7)) - 1; std::uint32_t res = char_arr_to_int(((x - 0x3030303030303030ull) & mask) << (71 - space_pos)); buf += (space_pos + 1) >> 3; return res; } void read(unsigned int &n, unsigned int &m, std::uint32_t *a, std::uint32_t *b) { std::fread(stdio_buf, 1, MAX_STDIO_SIZE, stdin); char *buf = stdio_buf; n = read_int(buf); m = read_int(buf); for (unsigned int i = 0; i < n; i++) { a[i] = read_int(buf); } for (unsigned int i = 0; i < m; i++) { b[i] = read_int(buf); } } void write_int(char *&buf, std::uint32_t x) { 
// write_int body, IO::write, the aligned work arrays v1/v2, and the truncated main.
constexpr std::uint32_t eight_digit = 100000000; if (x >= eight_digit) [[likely]] { std::uint32_t upper = x / eight_digit, lower = x % eight_digit; *buf = '0' + upper; buf++; *reinterpret_cast(buf) = int_to_char_arr(lower) + 0x3030303030303030ull; buf += 8; } else { std::uint64_t char_arr = int_to_char_arr(x); unsigned int padding = __builtin_ctzll(char_arr + (1ull << 63)) >> 3, len = 8 - padding; char_arr += 0x3030303030303030ull; *reinterpret_cast(buf) = char_arr >> (padding << 3); buf += len; } *buf = ' '; buf++; } void write(unsigned int n, std::uint32_t *a) { char *buf = stdio_buf; for (unsigned int i = 0; i < n; i++) { write_int(buf, a[i]); } unsigned int len = buf - stdio_buf; std::fwrite(stdio_buf, 1, len, stdout); } } io; alignas(32) static std::uint32_t v1[MAX_N * 2], v2[MAX_N * 2]; int32_t main() { ios_base::sync_with_stdio(false);cin.tie(0);cout.tie(0); /*string s="abbaaaf"; vector sa=suffix_array(s,256); vector lc=build_lcp(s,sa); for(int i:sa) cout<>t; while(t--) { int n,m;cin>>n>>m; string s1,s2;cin>>s1>>s2; string s0=s1; bool ok[n-m+1];for(int i=0;i<=n-m;++i) ok[i]=true; int h=1;while(h sa=suffix_array(s,256); vector lc=build_lcp(s,sa); vector pos(s.size()); for(int i=0;ij) swap(i,j); if(i==j) return inf; int o=31-__builtin_clz(j-i); return min(sp[i][o],sp[j-(1<bool { if(i==j) return false; bool sw=false; if(i>j) {swap(i,j);sw=true;} { if(j-i v; for(int i=0;i<=n-m;++i) if(ok[i]) v.push_back(i); int pos1=(*min_element(v.begin(),v.end(),cmp)); string ans=s1.substr(0,pos1); ans+=s2; ans+=s1.substr(pos1+m,n-m-pos1); /*if(ans!=stupid) { cout<