#include using namespace std; using ll = long long; using lll = __int128; using ld = long double; using u32 = uint32_t; using u64 = uint64_t; using u128 = unsigned __int128; constexpr int inf = 1 << 30; constexpr ll INF = 1LL << 60; constexpr ll mod = 998244353; constexpr ll MOD = 1000000007; using PL = pair; using ML = map; using SL = set; using QL = queue; using VL = vector; using VVL = vector; template using V = vector; template using VV = vector>; template using VVV = vector>; template using VVVV = vector>; template using VVVVV = vector>; template using VVVVVV = vector>; using VS = vector; using VP = vector; using VB = vector; template using PQ = priority_queue; template using PQG = priority_queue, greater>; #define V(type, name, ...) V name(__VA_ARGS__) #define VV(type, name, a, ...) VV name(a, V(__VA_ARGS__)) #define VVV(type, name, a, b, ...) VVV name(a, VV(b, V(__VA_ARGS__))) #define VVVV(type, name, a, b, c, ...) VVVV name(a, VVV(b, VV(c, V(__VA_ARGS__)))) #define VVVVV(type, name, a, b, c, d, ...) VVVVV name(a, VVVV(b, VVV(c, VV(d, V(__VA_ARGS__))))) #define VVVVVV(type, name, a, b, c, d, e, ...) VVVVVV name(a, VVVVV(b, VVVV(c, VVV(d, VV(e, V(__VA_ARGS__)))))) #define REP1(n) for (ll _ = 0, n_ = (n); _ < n_; ++_) #define REP2(i, n) for (ll i = 0, n_ = (n); i < n_; ++i) #define REP3(i, a, b) for (ll i = (a), b_ = (b); i < b_; ++i) #define REP4(i, a, b, s) for (ll i = (a), b_ = (b), s_ = (s); i < b_; i += s_) #define PER1(n) for (ll _ = ll(n) - 1; _ >= 0; --_) #define PER2(i, n) for (ll i = ll(n) - 1; i >= 0; --i) #define PER3(i, a, b) for (ll i = ll(b) - 1, a_ = (a); i >= a_; --i) #define PER4(i, a, b, s) for (ll i = ll(b) - 1, a_ = (a), s_ = (s); i >= a_; i -= s_) #define OVERLOAD4(a, b, c, d, e, ...) e #define REP(...) OVERLOAD4(__VA_ARGS__, REP4, REP3, REP2, REP1)(__VA_ARGS__) #define PER(...) OVERLOAD4(__VA_ARGS__, PER4, PER3, PER2, PER1)(__VA_ARGS__) #define FOR_SUBSET(s, mask) for (ll s = (mask), mask_ = mask; s >= 0; s = (s ? s - 1 & mask_ : -1)) #define ALL(v) v.begin(), v.end() #define RALL(v) v.rbegin(), v.rend() #define LEN(v) ll(v.size()) #define elif else if template constexpr auto min(const T&... a) { return min(initializer_list>{a...}); } template constexpr auto max(T... a) { return max(initializer_list>{a...}); } template void input(T&... a) { (cin >> ... >> a); } void print() { cout << '\n'; } template void print(const T& a, const Ts&... b) { cout << a; (cout << ... << (cout << ' ', b)); cout << '\n'; } template void printl(const Ts&... a) { (cout << ... << (cout << a, '\n')); } #define INT(...) int __VA_ARGS__; input(__VA_ARGS__) #define LL(...) ll __VA_ARGS__; input(__VA_ARGS__) #define STR(...) string __VA_ARGS__; input(__VA_ARGS__) template void vin(V& v) { REP(i, LEN(v)) cin >> v[i]; } template void vin(V& v, Ts&... vs) { vin(v); vin(vs...); } template void vvin(VV& v) { REP(i, LEN(v)) vin(v[i]); } template void vvin(VV& v, Ts&... vs) { vvin(v); vvin(vs...); } template void vout(V& v) { if (!v.size()) { cout << '\n'; return; } REP(i, LEN(v) - 1) cout << v[i] << ' '; cout << v.back() << '\n'; } template void voutl(V& v) { REP(i, LEN(v)) cout << v[i] << '\n'; } template void vvout(VV& v) { REP(i, LEN(v)) Vout(v[i]); } constexpr int popcnt(int x) { return __builtin_popcount(x); } constexpr int popcnt(u32 x) { return __builtin_popcount(x); } constexpr int popcnt(ll x) { return __builtin_popcountll(x); } constexpr int popcnt(u64 x) { return __builtin_popcountll(x); } constexpr int parity(int x) { return __builtin_parity(x); } constexpr int parity(u32 x) { return __builtin_parity(x); } constexpr int parity(ll x) { return __builtin_parityll(x); } constexpr int parity(u64 x) { return __builtin_parityll(x); } constexpr int topbit(int x) { return (x == 0 ? -1 : 31 - __builtin_clz(x)); } constexpr int topbit(u32 x) { return (x == 0 ? -1 : 31 - __builtin_clz(x)); } constexpr int topbit(ll x) { return (x == 0 ? -1 : 63 - __builtin_clzll(x)); } constexpr int topbit(u64 x) { return (x == 0 ? -1 : 63 - __builtin_clzll(x)); } constexpr int lowbit(int x) { return (x == 0 ? -1 : __builtin_ctz(x)); } constexpr int lowbit(u32 x) { return (x == 0 ? -1 : __builtin_ctz(x)); } constexpr int lowbit(ll x) { return (x == 0 ? -1 : __builtin_ctzll(x)); } constexpr int lowbit(u64 x) { return (x == 0 ? -1 : __builtin_ctzll(x)); } template auto floor(const T a, const S b) { return a / b - (a % b && (a ^ b) < 0); } template auto ceil(const T a, const S b) { return floor(a + b - 1, b); } template auto bmod(const T a, const S b) { return a - floor(a, b) * b; } template T SUM(const V& v) { T res = 0; REP(i, LEN(v)) res += v[i]; return res; } template T MIN(const V& v) { T res = numeric_limits::max(); REP(i, LEN(v)) res = min(res, v[i]); return res; } template T MAX(const V& v) { T res = numeric_limits::min(); REP(i, LEN(v)) res = max(res, v[i]); return res; } #define UNIQUE(v) sort(ALL(v)), v.erase(unique(ALL(v)), v.end()), v.shrink_to_fit() constexpr ll power(ll a, ll b) { ll res = 1; while (b) { if (b & 1) res *= a; a *= a; b >>= 1; } return res; } constexpr ll mpower(ll a, ll b, const ll& m) { ll res = 1; while (b) { if (b & 1) res = res * a % m; a = a * a % m; b >>= 1; } return res; } constexpr ll rootll(ll a, ll b) { ll l = 0, r = 1; while (power(r, b) <= a) r <<= 1; while (r - l > 1) { ll mid = (l + r) / 2; (power(mid, b) <= a ? l : r) = mid; } return l; } inline bool in_field(ll i, ll j, ll h, ll w) { return 0 <= i && i < h && 0 <= j && j < w; } #define fix() cout << fixed << setprecision(16) template void debug(const VS& s, int i, T a) { print(s[i], '=', a); } template void debug(const VS& s, int i, T a, Ts... b) { debug(s, i, a); debug(s, i + 1, b...); } template void debug(string s, Ts... a) { s += ','; VS t; string tmp; REP(i, LEN(s)) { if (s[i] == ',') { t.push_back(tmp); tmp = ""; ++i; } else tmp += s[i]; } debug(t, 0, a...); } #define DEBUG(...) print("=== DEBUG at line", __LINE__, "==="), debug(#__VA_ARGS__, __VA_ARGS__), print("=== end DEBUG at line", __LINE__, "===") template inline bool chmin(T& a, const S& b) { return (a > b ? a = b, true : false); } template inline bool chmax(T& a, const S& b) { return (a < b ? a = b, true : false); } void YES(bool t = true) { print(t ? "YES" : "NO"); } void Yes(bool t = true) { print(t ? "Yes" : "No"); } void yes(bool t = true) { print(t ? "yes" : "no"); } void NO(bool t = true) { YES(!t); } void No(bool t = true) { Yes(!t); } void no(bool t = true) { yes(!t); } int dx[8] = {1, 0, -1, 0, 1, -1, -1, 1}; int dy[8] = {0, 1, 0, -1, 1, 1, -1, -1}; // #pragma once #include #include #include #include #include namespace quick_floyd_warshall { namespace vectorize { // instruction set for vectorization enum class InstSet { DEFAULT, SSE4_2, AVX2, AVX512 }; std::string inst_set_to_str(InstSet inst_set) { if (inst_set == InstSet::DEFAULT) return "DEFAULT"; if (inst_set == InstSet::SSE4_2 ) return "SSE4_2"; if (inst_set == InstSet::AVX2 ) return "AVX2"; if (inst_set == InstSet::AVX512 ) return "AVX512"; return ""; } // wrapper of sse/avx intrinsics template class vector_base_t; template<> class vector_base_t { public: static constexpr int SIZE = 16; using internal_vector_t = __m128i; internal_vector_t vec; vector_base_t () = default; vector_base_t (internal_vector_t vec_) : vec(vec_) {} vector_base_t (void *ptr) : vec(_mm_load_si128((internal_vector_t *) ptr)) {} vector_base_t &store(void *ptr) { _mm_store_si128((internal_vector_t *) ptr, vec); return *this; } }; template class vector_base_t; template<> class vector_base_t { public: static constexpr int SIZE = 32; using internal_vector_t = __m256i; internal_vector_t vec; vector_base_t () = default; vector_base_t (internal_vector_t vec_) : vec(vec_) {} vector_base_t (void *ptr) : vec(_mm256_load_si256((internal_vector_t *) ptr)) {} vector_base_t &store(void *ptr) { _mm256_store_si256((internal_vector_t *) ptr, vec); return *this; } }; template<> class vector_base_t { public: static constexpr int SIZE = 64; using internal_vector_t = __m512i; internal_vector_t vec; vector_base_t () = default; vector_base_t (internal_vector_t vec_) : vec(vec_) {} vector_base_t (void *ptr) : vec(_mm512_load_si512((internal_vector_t *) ptr)) {} vector_base_t &store(void *ptr) { _mm512_store_si512((internal_vector_t *) ptr, vec); return *this; } }; template class vector_t; /* vec.chmin_store(mem): mem[i] = min(mem[i], vec[i]) vec.chmax_store(mem): mem[i] = max(mem[i], vec[i]) */ // DEFAULT / * template class vector_t { static_assert(std::is_same::value || std::is_same::value || std::is_same::value, ""); public: static constexpr int SIZE = sizeof(T); T val; vector_t &store(void *ptr) { *((T *) ptr) = val; return *this; } vector_t (void *val) : val(*((T *)val)) {} vector_t (T val) : val(val) {} vector_t operator + (const vector_t &rhs) const { return { T(val + rhs.val) }; } vector_t operator - (const vector_t &rhs) const { return { T(val - rhs.val) }; } vector_t operator - () const { return { -val }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { std::min(lhs.val, rhs.val) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { std::max(lhs.val, rhs.val) }; } vector_t &chmin_store(void *ptr) { if (*((T *) ptr) > val) store(ptr); return *this; } vector_t &chmax_store(void *ptr) { if (*((T *) ptr) < val) store(ptr); return *this; } }; // SSE4.2 / int16_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int16_t val) : vector_base_t(_mm_set1_epi16(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm_add_epi16(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm_sub_epi16(vec, rhs.vec) }; } vector_t operator - () const { return { _mm_sub_epi16(_mm_setzero_si128(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm_min_epi16(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm_max_epi16(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; } vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; } }; // SSE4.2 / int32_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int32_t val) : vector_base_t(_mm_set1_epi32(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm_add_epi32(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm_sub_epi32(vec, rhs.vec) }; } vector_t operator - () const { return { _mm_sub_epi32(_mm_setzero_si128(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm_min_epi32(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm_max_epi32(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; } vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; } }; // SSE4.2 / int64_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int64_t val) : vector_base_t(_mm_set1_epi64((__m64) val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm_add_epi64(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm_sub_epi64(vec, rhs.vec) }; } vector_t operator - () const { return { _mm_sub_epi64(_mm_setzero_si128(), vec) }; } // SSE4 doesn't have _mm_min_epi64 / _mm_max_epi64 friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm_blendv_epi8(lhs.vec, rhs.vec, _mm_cmpgt_epi64(lhs.vec, rhs.vec)) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm_blendv_epi8(lhs.vec, rhs.vec, _mm_cmpgt_epi64(rhs.vec, lhs.vec)) }; } vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; } vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; } }; // AVX2 / int16_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int16_t val) : vector_base_t(_mm256_set1_epi16(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm256_add_epi16(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm256_sub_epi16(vec, rhs.vec) }; } vector_t operator - () const { return { _mm256_sub_epi16(_mm256_setzero_si256(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm256_min_epi16(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm256_max_epi16(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; } vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; } }; // AVX2 / int32_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int32_t val) : vector_base_t(_mm256_set1_epi32(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm256_add_epi32(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm256_sub_epi32(vec, rhs.vec) }; } vector_t operator - () const { return { _mm256_sub_epi32(_mm256_setzero_si256(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm256_min_epi32(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm256_max_epi32(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; } vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; } }; // AVX2 / int64_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int64_t val) : vector_base_t(_mm256_set1_epi64x(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm256_add_epi64(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm256_sub_epi64(vec, rhs.vec) }; } vector_t operator - () const { return { _mm256_sub_epi64(_mm256_setzero_si256(), vec) }; } // avx2 doesn't have _mm256_min_epi64 / _mm256_max_epi64 friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm256_blendv_epi8(lhs.vec, rhs.vec, _mm256_cmpgt_epi64(lhs.vec, rhs.vec)) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm256_blendv_epi8(lhs.vec, rhs.vec, _mm256_cmpgt_epi64(rhs.vec, lhs.vec)) }; } vector_t &chmin_store(void *ptr) { // slower because of separate load instruction in the 1st operand of cmpgt _mm256_maskstore_epi64((long long *) ptr, _mm256_cmpgt_epi64(vector_t(ptr).vec, vec), vec); return *this; } vector_t &chmax_store(void *ptr) { // faster because cmpgt allows memory address as the 2nd operand _mm256_maskstore_epi64((long long *) ptr, _mm256_cmpgt_epi64(vec, vector_t(ptr).vec), vec); return *this; } }; // AVX512 / int16_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int16_t val) : vector_base_t(_mm512_set1_epi16(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm512_add_epi16(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm512_sub_epi16(vec, rhs.vec) }; } vector_t operator - () const { return { _mm512_sub_epi16(_mm512_setzero_si512(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm512_min_epi16(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm512_max_epi16(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { _mm512_mask_storeu_epi16((internal_vector_t *) (ptr), _mm512_cmp_epi16_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_LT), vec); return *this; } vector_t &chmax_store(void *ptr) { _mm512_mask_storeu_epi16((internal_vector_t *) (ptr), _mm512_cmp_epi16_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_GT), vec); return *this; } }; // AVX512 / int32_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int32_t val) : vector_base_t(_mm512_set1_epi32(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm512_add_epi32(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm512_sub_epi32(vec, rhs.vec) }; } vector_t operator - () const { return { _mm512_sub_epi32(_mm512_setzero_si512(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm512_min_epi32(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm512_max_epi32(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { _mm512_mask_store_epi32((internal_vector_t *) (ptr), _mm512_cmp_epi32_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_LT), vec); return *this; } vector_t &chmax_store(void *ptr) { _mm512_mask_store_epi32((internal_vector_t *) (ptr), _mm512_cmp_epi32_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_GT), vec); return *this; } }; // AVX512 / int64_t template<> class vector_t : public vector_base_t { public: using vector_base_t::vector_base_t; vector_t (int64_t val) : vector_base_t(_mm512_set1_epi64(val)) {} vector_t operator + (const vector_t &rhs) const { return { _mm512_add_epi64(vec, rhs.vec) }; } vector_t operator - (const vector_t &rhs) const { return { _mm512_sub_epi64(vec, rhs.vec) }; } vector_t operator - () const { return { _mm512_sub_epi64(_mm512_setzero_si512(), vec) }; } friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm512_min_epi64(lhs.vec, rhs.vec) }; } friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm512_max_epi64(lhs.vec, rhs.vec) }; } vector_t &chmin_store(void *ptr) { _mm512_mask_store_epi64((internal_vector_t *) (ptr), _mm512_cmp_epi64_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_LT), vec); return *this; } vector_t &chmax_store(void *ptr) { _mm512_mask_store_epi64((internal_vector_t *) (ptr), _mm512_cmp_epi64_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_GT), vec); return *this; } }; template vector_t &operator += ( vector_t &lhs, const vector_t &rhs) { lhs = lhs + rhs; return lhs; } template vector_t &operator -= ( vector_t &lhs, const vector_t &rhs) { lhs = lhs - rhs; return lhs; } } // namespace vectorize } // namespace quick_floyd_warshall // #pragma once #include #include #include #include #include #include #include // #include "internal/vectorize.h" namespace quick_floyd_warshall { template struct is_complete : std::false_type {}; template struct is_complete : std::true_type {}; template struct floyd_warshall_naive { using value_t = T; static constexpr T INF = std::numeric_limits::max() / 2; static std::string get_description() { return "naive"; } static void run(int n, const T *input_matrix, T *output_matrix, bool symmetric = false) { (void) symmetric; T *buf = (T *) malloc(n * n * sizeof(T)); memcpy(buf, input_matrix, n * n * sizeof(T)); for (int k = 0; k < n; k++) for (int i = 0; i < n; i++) for (int j = 0; j < n; j++) buf[i * n + j] = std::min(buf[i * n + j], buf[i * n + k] + buf[k * n + j]); memcpy(output_matrix, buf, n * n * sizeof(T)); free(buf); } }; using InstSet = vectorize::InstSet; template struct floyd_warshall { public: static constexpr T INF = std::numeric_limits::max() / 2; using value_t = T; static std::string get_description() { return "opt<" + vectorize::inst_set_to_str(inst_set) + ", " "int" + std::to_string(sizeof(value_t) * 8) + "_t, " + std::to_string(unroll_type) + ">"; } private: static constexpr int B = 64; // block size using vector_t = vectorize::vector_t; static_assert(is_complete::value, "Invalid inst_set or T"); static_assert(B % (vector_t::SIZE / sizeof(T)) == 0, "Invalid B value"); static_assert(unroll_type >= 0 && unroll_type <= 3, "Invalid unroll_type value"); /* MaxPlusMul?(a, b, c) : - [a, a + B * B), [b, b + B * B), [c, c + B * B) must not overlap - equivalent to: for all i, j in [0, B): a[i * B + j] = max(a[i * B + j], max{b[i * B + k] + c[k * B + j] | k in [0, B)}) */ static void MaxPlusMul0(T *a, T *b, T *c) { constexpr int n = B; for (int k = 0; k < n; k += 2) for (int i = 0; i < n; i += 2) { vector_t coef00(b[(i + 0) * n + (k + 0)]); vector_t coef01(b[(i + 0) * n + (k + 1)]); vector_t coef10(b[(i + 1) * n + (k + 0)]); vector_t coef11(b[(i + 1) * n + (k + 1)]); T *aa = a + i * n; T *bb = c + k * n; for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) { vector_t t0(bb + j); vector_t t1(bb + n + j); max(t0 + coef00, t1 + coef01).chmax_store(aa + j); max(t0 + coef10, t1 + coef11).chmax_store(aa + n + j); } } } static void MaxPlusMul1(T *a, T *b, T *c) { constexpr int n = B; for (int k = 0; k < n; k += 4) for (int i = 0; i < n; i += 2) { vector_t coef00(b[(i + 0) * n + (k + 0)]); vector_t coef01(b[(i + 0) * n + (k + 1)]); vector_t coef02(b[(i + 0) * n + (k + 2)]); vector_t coef03(b[(i + 0) * n + (k + 3)]); vector_t coef10(b[(i + 1) * n + (k + 0)]); vector_t coef11(b[(i + 1) * n + (k + 1)]); vector_t coef12(b[(i + 1) * n + (k + 2)]); vector_t coef13(b[(i + 1) * n + (k + 3)]); T *aa = a + i * n; T *bb = c + k * n; for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) { vector_t t0(bb + j); vector_t t1(bb + n + j); vector_t t2(bb + n + n + j); vector_t t3(bb + n + n + n + j); max(max(t0 + coef00, t1 + coef01), max(t2 + coef02, t3 + coef03)).chmax_store(aa + j); max(max(t0 + coef10, t1 + coef11), max(t2 + coef12, t3 + coef13)).chmax_store(aa + n + j); } } } static void MaxPlusMul2(T *a, T *b, T *c) { constexpr int n = B; for (int k = 0; k < n; k += 2) for (int i = 0; i < n; i += 4) { vector_t coef00(b[(i + 0) * n + (k + 0)]); vector_t coef01(b[(i + 0) * n + (k + 1)]); vector_t coef10(b[(i + 1) * n + (k + 0)]); vector_t coef11(b[(i + 1) * n + (k + 1)]); vector_t coef20(b[(i + 2) * n + (k + 0)]); vector_t coef21(b[(i + 2) * n + (k + 1)]); vector_t coef30(b[(i + 3) * n + (k + 0)]); vector_t coef31(b[(i + 3) * n + (k + 1)]); T *aa = a + i * n; T *bb = c + k * n; for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) { vector_t t0(bb + j); vector_t t1(bb + n + j); max(t0 + coef00, t1 + coef01).chmax_store(aa + j); max(t0 + coef10, t1 + coef11).chmax_store(aa + n + j); max(t0 + coef20, t1 + coef21).chmax_store(aa + n + n + j); max(t0 + coef30, t1 + coef31).chmax_store(aa + n + n + n + j); } } } static void MaxPlusMul3(T *a, T *b, T *c) { constexpr int n = B; for (int k = 0; k < n; k += 4) for (int i = 0; i < n; i += 4) { vector_t coef00(b[(i + 0) * n + (k + 0)]); vector_t coef01(b[(i + 0) * n + (k + 1)]); vector_t coef02(b[(i + 0) * n + (k + 2)]); vector_t coef03(b[(i + 0) * n + (k + 3)]); vector_t coef10(b[(i + 1) * n + (k + 0)]); vector_t coef11(b[(i + 1) * n + (k + 1)]); vector_t coef12(b[(i + 1) * n + (k + 2)]); vector_t coef13(b[(i + 1) * n + (k + 3)]); vector_t coef20(b[(i + 2) * n + (k + 0)]); vector_t coef21(b[(i + 2) * n + (k + 1)]); vector_t coef22(b[(i + 2) * n + (k + 2)]); vector_t coef23(b[(i + 2) * n + (k + 3)]); vector_t coef30(b[(i + 3) * n + (k + 0)]); vector_t coef31(b[(i + 3) * n + (k + 1)]); vector_t coef32(b[(i + 3) * n + (k + 2)]); vector_t coef33(b[(i + 3) * n + (k + 3)]); T *aa = a + i * n; T *bb = c + k * n; for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) { vector_t t0(bb + j); vector_t t1(bb + n + j); vector_t t2(bb + n + n + j); vector_t t3(bb + n + n + n + j); max(max(t0 + coef00, t1 + coef01), max(t2 + coef02, t3 + coef03)).chmax_store(aa + j); max(max(t0 + coef10, t1 + coef11), max(t2 + coef12, t3 + coef13)).chmax_store(aa + n + j); max(max(t0 + coef20, t1 + coef21), max(t2 + coef22, t3 + coef23)).chmax_store(aa + n + n + j); max(max(t0 + coef30, t1 + coef31), max(t2 + coef32, t3 + coef33)).chmax_store(aa + n + n + n + j); } } } static void FWI(T *a, T *b, T *c) { if (a != b && a != c && b != c) { if (unroll_type == 0) MaxPlusMul0(a, b, c); if (unroll_type == 1) MaxPlusMul1(a, b, c); if (unroll_type == 2) MaxPlusMul2(a, b, c); if (unroll_type == 3) MaxPlusMul3(a, b, c); return; } constexpr int n = B; for (int k = 0; k < n; k++) for (int i = 0; i < n; i++) { vector_t coef(b[i * n + k]); T *aa = a + i * n; T *bb = c + k * n; for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) (vector_t(bb + j) + coef).chmax_store(aa + j); } } static void FWR(int n_blocks_power2, int n_blocks, int block_index0, int block_index1, int block_index2, T **block_start, bool symmetric) { if (block_index0 >= n_blocks || block_index1 >= n_blocks || block_index2 >= n_blocks) return; if (n_blocks_power2 == 1) { FWI( block_start[block_index0 * n_blocks + block_index2], block_start[block_index0 * n_blocks + block_index1], block_start[block_index1 * n_blocks + block_index2] ); } else { int half = n_blocks_power2 >> 1; if (!symmetric) { FWR(half, n_blocks, block_index0 , block_index1 , block_index2 , block_start, false); FWR(half, n_blocks, block_index0 , block_index1 , block_index2 + half, block_start, false); FWR(half, n_blocks, block_index0 + half, block_index1 , block_index2 , block_start, false); FWR(half, n_blocks, block_index0 + half, block_index1 , block_index2 + half, block_start, false); FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2 + half, block_start, false); FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2 , block_start, false); FWR(half, n_blocks, block_index0 , block_index1 + half, block_index2 + half, block_start, false); FWR(half, n_blocks, block_index0 , block_index1 + half, block_index2 , block_start, false); } else { // if symmetric, block_index0 = block_index1 = block_index2 FWR(half, n_blocks, block_index0 , block_index1 , block_index2 , block_start, true); FWR(half, n_blocks, block_index0 , block_index1 , block_index2 + half, block_start, false); transpose_copy(half, n_blocks, block_index0, block_index0 + half, block_start); FWR(half, n_blocks, block_index0 + half, block_index1 , block_index2 + half, block_start, false); FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2 + half, block_start, true); FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2 , block_start, false); transpose_copy(half, n_blocks, block_index0 + half, block_index0, block_start); FWR(half, n_blocks, block_index0 , block_index1 + half, block_index2 , block_start, false); } } } // copy [block_row_offset:block_row_offset+n)[block_column_offset:block_column_offset+n) to its transposed posititon // anything outside n_blocks * n_blocks blocks is ignored static void transpose_copy(int n, int n_blocks, int block_row_offset, int block_column_offset, T **block_start) { for (int i = block_row_offset; i < block_row_offset + n && i < n_blocks; i++) for (int j = block_column_offset; j < block_column_offset + n && j < n_blocks; j++) { T *src = block_start[i * n_blocks + j]; T *dst = block_start[j * n_blocks + i]; for (int y = 0; y < B; y++) for (int x = 0; x < B; x++) dst[x * B + y] = src[y * B + x]; } } /* rev == false : Copy the n * n elements in src to dst in the order like this(each src[i][j] is a BxB block): dst: src[0][0], src[0][1], src[1][0], src[1][1], src[0][2], src[0][3], src[1][2], src[1][3], src[2][0], src[2][1], src[3][0], src[3][1], src[2][2], src[2][3], src[3][2], src[3][3], src[0][4], src[0][5], src[1][4], src[1][5], ... and returns the pointer to the next element of the last element written in dst. Elements outside the n_blocks * n_blocks blocks will be ignored Elements in dst corresponding to elements outside the src_n * src_n but in n_blocks * n_blocks will be filled block_start[i * n_blocks + j] will point to the starting element in dst corresponding to (i, j) block rev == true : same as rev == false except that the copy direction is reversed and elements in dst where INF would be contained if !rev are untouched This function negates all the element and FWR handles everything with max instead of min. This is because chmax(mem, reg) can be implemented faster than chmin(mem, reg) with avx2+int64_t The cost of negation should be negligible for other combinations, where this trick is irrelevant */ static T *reorder(int src_n, int n_blocks_power2, T *dst_head, T *src, T **block_start, int block_row, int block_column, bool rev) { int n_blocks = (src_n + B - 1) / B; if (block_row >= n_blocks || block_column >= n_blocks) return dst_head; if (n_blocks_power2 == 1) { T *src_base = src + (block_row * B * src_n + block_column * B); for (int i = 0; i < B; i++) { if (block_row * B + i < src_n) { int length = std::min(B, src_n - block_column * B); if (!rev) { for (int j = 0; j < length; j++) dst_head[i * B + j] = -src_base[i * src_n + j]; for (int j = length; j < B; j++) dst_head[i * B + j] = -INF; } else { for (int j = 0; j < length; j++) src_base[i * src_n + j] = -dst_head[i * B + j]; } } else { if (!rev) std::fill(dst_head + i * B, dst_head + (i + 1) * B, -INF); } } block_start[block_row * n_blocks + block_column] = dst_head; return dst_head + B * B; } else { int n_blocks_p2_half = n_blocks_power2 >> 1; // split into 2x2 recursively for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) dst_head = reorder(src_n, n_blocks_p2_half, dst_head, src, block_start, block_row + i * n_blocks_p2_half, block_column + j * n_blocks_p2_half, rev); return dst_head; } } public: static void run(int src_n, const T *input_matrix, T *output_matrix, bool symmetric = false) { assert(0 <= src_n && src_n < 65536); if (src_n == 0) return; int n_blocks = (src_n + B - 1) / B; // number of BxB blocks in a row int n_blocks_power2 = 1; // smallest power of 2 >= src_n / B while (n_blocks_power2 * B < src_n) n_blocks_power2 *= 2; // allocate and align the needed buffers const size_t reordered_needed_size = (B * n_blocks) * (B * n_blocks) * sizeof(T); size_t reordered_buffer_size = reordered_needed_size + 64; void *reordered_org = malloc(reordered_buffer_size); assert(reordered_org); void *reordered = reordered_org; assert(std::align(64, reordered_needed_size, reordered, reordered_buffer_size)); // block_start[i][j] : pointer to the starting element of the (i, j) block in t T **block_start = (T **) malloc(n_blocks * n_blocks * sizeof(T *)); std::fill(block_start, block_start + n_blocks * n_blocks, nullptr); reorder(src_n, n_blocks_power2, (T *) reordered, const_cast(input_matrix), block_start, 0, 0, false); FWR(n_blocks_power2, n_blocks, 0, 0, 0, block_start, symmetric); reorder(src_n, n_blocks_power2, (T *) reordered, output_matrix, block_start, 0, 0, true); free(reordered_org); free(block_start); } }; } // namespace quick_floyd_warshall using namespace quick_floyd_warshall; int main() { cin.tie(nullptr); ios::sync_with_stdio(false); LL(n, m, l); --l; VL t(n); vin(t); using runner = floyd_warshall; vector matrix(n * n, runner::INF); REP(m) { LL(a, b, c); --a; --b; matrix[a * n + b] = matrix[b * n + a] = c; } REP(i, n) matrix[i * (n + 1)] = 0; runner::run(n, matrix.data(), matrix.data(), true); ll ans = INF; REP(i, n) { ll memo = 0; ll x = 0; REP(j, n) { memo += matrix[i * n + j] * t[j]; if (t[j]) chmin(x, matrix[l * n + j] - matrix[i * n + j]); } memo *= 2; chmin(ans, memo + x); } print(ans); }