#996611 (C++23) No.788 トラックの移動

提出ソース
結果

問題	No.788 トラックの移動
ユーザー	eggkid6
提出日時	2024-07-20 18:09:26
言語	C++23 (gcc 13.3.0 + boost 1.87.0)
結果	TLE
実行時間	-
コード長	34,875 bytes
コンパイル時間	5,027 ms
コンパイル使用メモリ	363,436 KB
実行使用メモリ	72,576 KB
最終ジャッジ日時	2024-07-20 18:09:50
合計ジャッジ時間	19,294 ms
ジャッジサーバーID （参考情報）	judge5 / judge3
このコードへのチャレンジ
（要ログイン）
ファイルパターン	結果
sample	AC * 3
other	AC * 6 WA * 3 TLE * 5
権限があれば一括ダウンロードができます
ソースコード

raw source code
#include <bits/stdc++.h>
using namespace std;
using ll = long long;
using lll = __int128;
using ld = long double;
using u32 = uint32_t;
using u64 = uint64_t;
using u128 = unsigned __int128;
constexpr int inf = 1 << 30;
constexpr ll INF = 1LL << 60;
constexpr ll mod = 998244353;
constexpr ll MOD = 1000000007;
using PL = pair<ll, ll>;
using ML = map<ll, ll>;
using SL = set<ll>;
using QL = queue<ll>;
using VL = vector<ll>;
using VVL = vector<VL>;
template <class T>
using V = vector<T>;
template <class T>
using VV = vector<V<T>>;
template <class T>
using VVV = vector<VV<T>>;
template <class T>
using VVVV = vector<VVV<T>>;
template <class T>
using VVVVV = vector<VVVV<T>>;
template <class T>
using VVVVVV = vector<VVVVV<T>>;
using VS = vector<string>;
using VP = vector<PL>;
using VB = vector<bool>;
template <class T>
using PQ = priority_queue<T>;
template <class T>
using PQG = priority_queue<T, V<T>, greater<T>>;
#define V(type, name, ...) V<type> name(__VA_ARGS__)
#define VV(type, name, a, ...) VV<type> name(a, V<type>(__VA_ARGS__))
#define VVV(type, name, a, b, ...) VVV<type> name(a, VV<type>(b, V<type>(__VA_ARGS__)))
#define VVVV(type, name, a, b, c, ...) VVVV<type> name(a, VVV<type>(b, VV<type>(c, V<type>(__VA_ARGS__))))
#define VVVVV(type, name, a, b, c, d, ...) VVVVV<type> name(a, VVVV<type>(b, VVV<type>(c, VV<type>(d, V<type>(__VA_ARGS__)))))
#define VVVVVV(type, name, a, b, c, d, e, ...) VVVVVV<type> name(a, VVVVV<type>(b, VVVV<type>(c, VVV<type>(d, VV<type>(e, V<type>(__VA_ARGS__))))))
#define REP1(n) for (ll _ = 0, n_ = (n); _ < n_; ++_)
#define REP2(i, n) for (ll i = 0, n_ = (n); i < n_; ++i)
#define REP3(i, a, b) for (ll i = (a), b_ = (b); i < b_; ++i)
#define REP4(i, a, b, s) for (ll i = (a), b_ = (b), s_ = (s); i < b_; i += s_)
#define PER1(n) for (ll _ = ll(n) - 1; _ >= 0; --_)
#define PER2(i, n) for (ll i = ll(n) - 1; i >= 0; --i)
#define PER3(i, a, b) for (ll i = ll(b) - 1, a_ = (a); i >= a_; --i)
#define PER4(i, a, b, s) for (ll i = ll(b) - 1, a_ = (a), s_ = (s); i >= a_; i -= s_)
#define OVERLOAD4(a, b, c, d, e, ...) e
#define REP(...) OVERLOAD4(__VA_ARGS__, REP4, REP3, REP2, REP1)(__VA_ARGS__)
#define PER(...) OVERLOAD4(__VA_ARGS__, PER4, PER3, PER2, PER1)(__VA_ARGS__)
#define FOR_SUBSET(s, mask) for (ll s = (mask), mask_ = mask; s >= 0; s = (s ? s - 1 & mask_ : -1))
#define ALL(v) v.begin(), v.end()
#define RALL(v) v.rbegin(), v.rend()
#define LEN(v) ll(v.size())
#define elif else if
template <class... T>
constexpr auto min(const T&... a) {
    return min(initializer_list<common_type_t<T...>>{a...});
}
template <class... T>
constexpr auto max(T... a) {
    return max(initializer_list<common_type_t<T...>>{a...});
}
template <class... T>
void input(T&... a) {
    (cin >> ... >> a);
}
void print() {
    cout << '\n';
}
template <class T, class... Ts>
void print(const T& a, const Ts&... b) {
    cout << a;
    (cout << ... << (cout << ' ', b));
    cout << '\n';
}
template <class... Ts>
void printl(const Ts&... a) {
    (cout << ... << (cout << a, '\n'));
}
#define INT(...) int __VA_ARGS__; input(__VA_ARGS__)
#define LL(...) ll __VA_ARGS__; input(__VA_ARGS__)
#define STR(...) string __VA_ARGS__; input(__VA_ARGS__)
template <class T>
void vin(V<T>& v) {
    REP(i, LEN(v)) cin >> v[i];
}
template <class T, class... Ts>
void vin(V<T>& v, Ts&... vs) {
    vin(v);
    vin(vs...);
}
template <class T>
void vvin(VV<T>& v) {
    REP(i, LEN(v)) vin(v[i]);
}
template <class T, class... Ts>
void vvin(VV<T>& v, Ts&... vs) {
    vvin(v);
    vvin(vs...);
}
template <class T>
void vout(V<T>& v) {
    if (!v.size()) {
        cout << '\n';
        return;
    }
    REP(i, LEN(v) - 1) cout << v[i] << ' ';
    cout << v.back() << '\n';
}
template <class T>
void voutl(V<T>& v) {
    REP(i, LEN(v)) cout << v[i] << '\n';
}
template <class T>
void vvout(VV<T>& v) {
    REP(i, LEN(v)) Vout(v[i]);
}
constexpr int popcnt(int x) { return __builtin_popcount(x); }
constexpr int popcnt(u32 x) { return __builtin_popcount(x); }
constexpr int popcnt(ll x) { return __builtin_popcountll(x); }
constexpr int popcnt(u64 x) { return __builtin_popcountll(x); }
constexpr int parity(int x) { return __builtin_parity(x); }
constexpr int parity(u32 x) { return __builtin_parity(x); }
constexpr int parity(ll x) { return __builtin_parityll(x); }
constexpr int parity(u64 x) { return __builtin_parityll(x); }
constexpr int topbit(int x) { return (x == 0 ? -1 : 31 - __builtin_clz(x)); }
constexpr int topbit(u32 x) { return (x == 0 ? -1 : 31 - __builtin_clz(x)); }
constexpr int topbit(ll x) { return (x == 0 ? -1 : 63 - __builtin_clzll(x)); }
constexpr int topbit(u64 x) { return (x == 0 ? -1 : 63 - __builtin_clzll(x)); }
constexpr int lowbit(int x) { return (x == 0 ? -1 : __builtin_ctz(x)); }
constexpr int lowbit(u32 x) { return (x == 0 ? -1 : __builtin_ctz(x)); }
constexpr int lowbit(ll x) { return (x == 0 ? -1 : __builtin_ctzll(x)); }
constexpr int lowbit(u64 x) { return (x == 0 ? -1 : __builtin_ctzll(x)); }
template <class T, class S>
auto floor(const T a, const S b) {
    return a / b - (a % b && (a ^ b) < 0);
}
template <class T, class S>
auto ceil(const T a, const S b) {
    return floor(a + b - 1, b);
}
template <class T, class S>
auto bmod(const T a, const S b) {
    return a - floor(a, b) * b;
}
template <class T>
T SUM(const V<T>& v) {
    T res = 0;
    REP(i, LEN(v)) res += v[i];
    return res;
}
template <class T>
T MIN(const V<T>& v) {
    T res = numeric_limits<T>::max();
    REP(i, LEN(v)) res = min(res, v[i]);
    return res;
}
template <class T>
T MAX(const V<T>& v) {
    T res = numeric_limits<T>::min();
    REP(i, LEN(v)) res = max(res, v[i]);
    return res;
}
#define UNIQUE(v) sort(ALL(v)), v.erase(unique(ALL(v)), v.end()), v.shrink_to_fit()
constexpr ll power(ll a, ll b) {
    ll res = 1;
    while (b) {
        if (b & 1) res *= a;
        a *= a;
        b >>= 1;
    }
    return res;
}
constexpr ll mpower(ll a, ll b, const ll& m) {
    ll res = 1;
    while (b) {
        if (b & 1) res = res * a % m;
        a = a * a % m;
        b >>= 1;
    }
    return res;
}
constexpr ll rootll(ll a, ll b) {
    ll l = 0, r = 1;
    while (power(r, b) <= a) r <<= 1;
    while (r - l > 1) {
        ll mid = (l + r) / 2;
        (power(mid, b) <= a ? l : r) = mid;
    }
    return l;
}
inline bool in_field(ll i, ll j, ll h, ll w) {
    return 0 <= i && i < h && 0 <= j && j < w;
}
#define fix() cout << fixed << setprecision(16)
template <class T>
void debug(const VS& s, int i, T a) {
    print(s[i], '=', a);
}
template <class T, class... Ts>
void debug(const VS& s, int i, T a, Ts... b) {
    debug(s, i, a);
    debug(s, i + 1, b...);
}
template <class... Ts>
void debug(string s, Ts... a) {
    s += ',';
    VS t;
    string tmp;
    REP(i, LEN(s)) {
        if (s[i] == ',') {
            t.push_back(tmp);
            tmp = "";
            ++i;
        }
        else tmp += s[i];
    }
    debug(t, 0, a...);
}
#define DEBUG(...) print("=== DEBUG at line", __LINE__, "==="), debug(#__VA_ARGS__, __VA_ARGS__), print("=== end DEBUG at line", __LINE__, "===")
template <class T, class S>
inline bool chmin(T& a, const S& b) {
    return (a > b ? a = b, true : false);
}
template <class T, class S>
inline bool chmax(T& a, const S& b) {
    return (a < b ? a = b, true : false);
}
void YES(bool t = true) {
    print(t ? "YES" : "NO");
}
void Yes(bool t = true) {
    print(t ? "Yes" : "No");
}
void yes(bool t = true) {
    print(t ? "yes" : "no");
}
void NO(bool t = true) { YES(!t); }
void No(bool t = true) { Yes(!t); }
void no(bool t = true) { yes(!t); }
int dx[8] = {1, 0, -1, 0, 1, -1, -1, 1};
int dy[8] = {0, 1, 0, -1, 1, 1, -1, -1};

// #pragma once
#include <cstdint>
#include <type_traits>
#include <string>
#include <algorithm>
#include <immintrin.h>

namespace quick_floyd_warshall {
namespace vectorize {

// instruction set for vectorization
enum class InstSet {
	DEFAULT,
	SSE4_2,
	AVX2,
	AVX512
};
std::string inst_set_to_str(InstSet inst_set) {
	if (inst_set == InstSet::DEFAULT) return "DEFAULT";
	if (inst_set == InstSet::SSE4_2 ) return "SSE4_2";
	if (inst_set == InstSet::AVX2   ) return "AVX2";
	if (inst_set == InstSet::AVX512 ) return "AVX512";
	return "";
}

// wrapper of sse/avx intrinsics
template<InstSet inst_set> class vector_base_t;
template<> class vector_base_t<InstSet::SSE4_2> {
public:
	static constexpr int SIZE = 16;
	using internal_vector_t = __m128i;
	internal_vector_t vec;
	
	vector_base_t ()  = default;
	vector_base_t (internal_vector_t vec_) : vec(vec_) {}
	vector_base_t (void *ptr) : vec(_mm_load_si128((internal_vector_t *) ptr)) {}
	vector_base_t &store(void *ptr) { _mm_store_si128((internal_vector_t *) ptr, vec); return *this; }
};
template<InstSet inst_set> class vector_base_t;
template<> class vector_base_t<InstSet::AVX2> {
public:
	static constexpr int SIZE = 32;
	using internal_vector_t = __m256i;
	internal_vector_t vec;
	
	vector_base_t ()  = default;
	vector_base_t (internal_vector_t vec_) : vec(vec_) {}
	vector_base_t (void *ptr) : vec(_mm256_load_si256((internal_vector_t *) ptr)) {}
	vector_base_t &store(void *ptr) { _mm256_store_si256((internal_vector_t *) ptr, vec); return *this; }
};
template<> class vector_base_t<InstSet::AVX512> {
public:
	static constexpr int SIZE = 64;
	using internal_vector_t = __m512i;
	internal_vector_t vec;
	
	vector_base_t ()  = default;
	vector_base_t (internal_vector_t vec_) : vec(vec_) {}
	vector_base_t (void *ptr) : vec(_mm512_load_si512((internal_vector_t *) ptr)) {}
	vector_base_t &store(void *ptr) { _mm512_store_si512((internal_vector_t *) ptr, vec); return *this; }
};

template<InstSet inst_set, typename T> class vector_t;

/*
	vec.chmin_store(mem): mem[i] = min(mem[i], vec[i])
	vec.chmax_store(mem): mem[i] = max(mem[i], vec[i])
*/


// DEFAULT / *
template<typename T> class vector_t<InstSet::DEFAULT, T> {
	static_assert(std::is_same<T, int16_t>::value || std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value, "");
public:
	static constexpr int SIZE = sizeof(T);
	T val;
	vector_t &store(void *ptr) { *((T *) ptr) = val; return *this; }
	vector_t (void *val) : val(*((T *)val)) {}
	vector_t (T val) : val(val) {}
	vector_t operator + (const vector_t &rhs) const { return { T(val + rhs.val) }; }
	vector_t operator - (const vector_t &rhs) const { return { T(val - rhs.val) }; }
	vector_t operator - () const { return { -val }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { std::min(lhs.val, rhs.val) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { std::max(lhs.val, rhs.val) }; }
	vector_t &chmin_store(void *ptr) { if (*((T *) ptr) > val) store(ptr); return *this; }
	vector_t &chmax_store(void *ptr) { if (*((T *) ptr) < val) store(ptr); return *this; }
};



// SSE4.2 / int16_t
template<> class vector_t<InstSet::SSE4_2, int16_t> : public vector_base_t<InstSet::SSE4_2> {
public:
	using vector_base_t<InstSet::SSE4_2>::vector_base_t;
	vector_t (int16_t val) : vector_base_t(_mm_set1_epi16(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm_add_epi16(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm_sub_epi16(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm_sub_epi16(_mm_setzero_si128(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm_min_epi16(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm_max_epi16(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; }
	vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; }
};
// SSE4.2 / int32_t
template<> class vector_t<InstSet::SSE4_2, int32_t> : public vector_base_t<InstSet::SSE4_2> {
public:
	using vector_base_t<InstSet::SSE4_2>::vector_base_t;
	vector_t (int32_t val) : vector_base_t(_mm_set1_epi32(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm_add_epi32(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm_sub_epi32(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm_sub_epi32(_mm_setzero_si128(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm_min_epi32(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm_max_epi32(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; }
	vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; }
};
// SSE4.2 / int64_t
template<> class vector_t<InstSet::SSE4_2, int64_t> : public vector_base_t<InstSet::SSE4_2> {
public:
	using vector_base_t<InstSet::SSE4_2>::vector_base_t;
	vector_t (int64_t val) : vector_base_t(_mm_set1_epi64((__m64) val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm_add_epi64(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm_sub_epi64(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm_sub_epi64(_mm_setzero_si128(), vec) }; }
	// SSE4 doesn't have _mm_min_epi64 / _mm_max_epi64
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return {
		_mm_blendv_epi8(lhs.vec, rhs.vec, _mm_cmpgt_epi64(lhs.vec, rhs.vec))
	}; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return {
		_mm_blendv_epi8(lhs.vec, rhs.vec, _mm_cmpgt_epi64(rhs.vec, lhs.vec))
	}; }
	vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; }
	vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; }
};



// AVX2 / int16_t
template<> class vector_t<InstSet::AVX2, int16_t> : public vector_base_t<InstSet::AVX2> {
public:
	using vector_base_t<InstSet::AVX2>::vector_base_t;
	vector_t (int16_t val) : vector_base_t(_mm256_set1_epi16(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm256_add_epi16(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm256_sub_epi16(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm256_sub_epi16(_mm256_setzero_si256(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm256_min_epi16(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm256_max_epi16(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; }
	vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; }
};
// AVX2 / int32_t
template<> class vector_t<InstSet::AVX2, int32_t> : public vector_base_t<InstSet::AVX2> {
public:
	using vector_base_t<InstSet::AVX2>::vector_base_t;
	vector_t (int32_t val) : vector_base_t(_mm256_set1_epi32(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm256_add_epi32(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm256_sub_epi32(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm256_sub_epi32(_mm256_setzero_si256(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm256_min_epi32(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm256_max_epi32(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) { min(*this, vector_t(ptr)).store(ptr); return *this; }
	vector_t &chmax_store(void *ptr) { max(*this, vector_t(ptr)).store(ptr); return *this; }
};
// AVX2 / int64_t
template<> class vector_t<InstSet::AVX2, int64_t> : public vector_base_t<InstSet::AVX2> {
public:
	using vector_base_t<InstSet::AVX2>::vector_base_t;
	vector_t (int64_t val) : vector_base_t(_mm256_set1_epi64x(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm256_add_epi64(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm256_sub_epi64(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm256_sub_epi64(_mm256_setzero_si256(), vec) }; }
	// avx2 doesn't have _mm256_min_epi64 / _mm256_max_epi64
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return {
		_mm256_blendv_epi8(lhs.vec, rhs.vec, _mm256_cmpgt_epi64(lhs.vec, rhs.vec))
	}; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return {
		_mm256_blendv_epi8(lhs.vec, rhs.vec, _mm256_cmpgt_epi64(rhs.vec, lhs.vec))
	}; }
	vector_t &chmin_store(void *ptr) { // slower because of separate load instruction in the 1st operand of cmpgt
		_mm256_maskstore_epi64((long long *) ptr, _mm256_cmpgt_epi64(vector_t(ptr).vec, vec), vec);
		return *this;
	}
	vector_t &chmax_store(void *ptr) { // faster because cmpgt allows memory address as the 2nd operand
		_mm256_maskstore_epi64((long long *) ptr, _mm256_cmpgt_epi64(vec, vector_t(ptr).vec), vec);
		return *this;
	}
};



// AVX512 / int16_t
template<> class vector_t<InstSet::AVX512, int16_t> : public vector_base_t<InstSet::AVX512> {
public:
	using vector_base_t<InstSet::AVX512>::vector_base_t;
	vector_t (int16_t val) : vector_base_t(_mm512_set1_epi16(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm512_add_epi16(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm512_sub_epi16(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm512_sub_epi16(_mm512_setzero_si512(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm512_min_epi16(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm512_max_epi16(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) {
		_mm512_mask_storeu_epi16((internal_vector_t *) (ptr),
			_mm512_cmp_epi16_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_LT), vec);
		return *this;
	}
	vector_t &chmax_store(void *ptr) {
		_mm512_mask_storeu_epi16((internal_vector_t *) (ptr),
			_mm512_cmp_epi16_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_GT), vec);
		return *this;
	}
};
// AVX512 / int32_t
template<> class vector_t<InstSet::AVX512, int32_t> : public vector_base_t<InstSet::AVX512> {
public:
	using vector_base_t<InstSet::AVX512>::vector_base_t;
	vector_t (int32_t val) : vector_base_t(_mm512_set1_epi32(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm512_add_epi32(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm512_sub_epi32(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm512_sub_epi32(_mm512_setzero_si512(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm512_min_epi32(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm512_max_epi32(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) {
		_mm512_mask_store_epi32((internal_vector_t *) (ptr),
			_mm512_cmp_epi32_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_LT), vec);
		return *this;
	}
	vector_t &chmax_store(void *ptr) {
		_mm512_mask_store_epi32((internal_vector_t *) (ptr),
			_mm512_cmp_epi32_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_GT), vec);
		return *this;
	}
};
// AVX512 / int64_t
template<> class vector_t<InstSet::AVX512, int64_t> : public vector_base_t<InstSet::AVX512> {
public:
	using vector_base_t<InstSet::AVX512>::vector_base_t;
	vector_t (int64_t val) : vector_base_t(_mm512_set1_epi64(val)) {}
	vector_t operator + (const vector_t &rhs) const { return { _mm512_add_epi64(vec, rhs.vec) }; }
	vector_t operator - (const vector_t &rhs) const { return { _mm512_sub_epi64(vec, rhs.vec) }; }
	vector_t operator - () const { return { _mm512_sub_epi64(_mm512_setzero_si512(), vec) }; }
	friend vector_t min(const vector_t &lhs, const vector_t &rhs) { return { _mm512_min_epi64(lhs.vec, rhs.vec) }; }
	friend vector_t max(const vector_t &lhs, const vector_t &rhs) { return { _mm512_max_epi64(lhs.vec, rhs.vec) }; }
	vector_t &chmin_store(void *ptr) {
		_mm512_mask_store_epi64((internal_vector_t *) (ptr),
			_mm512_cmp_epi64_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_LT), vec);
		return *this;
	}
	vector_t &chmax_store(void *ptr) {
		_mm512_mask_store_epi64((internal_vector_t *) (ptr),
			_mm512_cmp_epi64_mask(vec, _mm512_load_si512((internal_vector_t *) ptr), _MM_CMPINT_GT), vec);
		return *this;
	}
};


template<InstSet inst_set, typename T> vector_t<inst_set, T> &operator += (
	vector_t<inst_set, T> &lhs, const vector_t<inst_set, T> &rhs) {
	lhs = lhs + rhs;
	return lhs;
}
template<InstSet inst_set, typename T> vector_t<inst_set, T> &operator -= (
	vector_t<inst_set, T> &lhs, const vector_t<inst_set, T> &rhs) {
	lhs = lhs - rhs;
	return lhs;
}

} // namespace vectorize
} // namespace quick_floyd_warshall

// #pragma once
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <string>
#include <memory>
#include <type_traits>
#include <limits>
// #include "internal/vectorize.h"

namespace quick_floyd_warshall {

template <class T, class = void> struct is_complete : std::false_type {};
template <class T> struct is_complete<T, decltype(void(sizeof(T)))> : std::true_type {};

template<typename T> struct floyd_warshall_naive {
	using value_t = T;
	static constexpr T INF = std::numeric_limits<T>::max() / 2;
	static std::string get_description() { return "naive<int" + std::to_string(sizeof(value_t) * 8) + "_t>"; }
	static void run(int n, const T *input_matrix, T *output_matrix, bool symmetric = false) {
		(void) symmetric;
		T *buf = (T *) malloc(n * n * sizeof(T));
		memcpy(buf, input_matrix, n * n * sizeof(T));
		for (int k = 0; k < n; k++) for (int i = 0; i < n; i++) for (int j = 0; j < n; j++) 
			buf[i * n + j] = std::min<T>(buf[i * n + j], buf[i * n + k] + buf[k * n + j]);
		memcpy(output_matrix, buf, n * n * sizeof(T));
		free(buf);
	}
};

using InstSet = vectorize::InstSet;

template<InstSet inst_set, typename T, int unroll_type> struct floyd_warshall {
public:
	static constexpr T INF = std::numeric_limits<T>::max() / 2;
	using value_t = T;
	
	static std::string get_description() {
		return "opt<" + vectorize::inst_set_to_str(inst_set) + ", " "int" + std::to_string(sizeof(value_t) * 8) + "_t, "
			+ std::to_string(unroll_type) + ">"; 
	}
private:
	static constexpr int B = 64; // block size
	using vector_t = vectorize::vector_t<inst_set, T>;
	
	static_assert(is_complete<vector_t>::value, "Invalid inst_set or T");
	static_assert(B % (vector_t::SIZE / sizeof(T)) == 0, "Invalid B value");
	static_assert(unroll_type >= 0 && unroll_type <= 3, "Invalid unroll_type value");
	
	/*
		MaxPlusMul?(a, b, c) :
		 - [a, a + B * B), [b, b + B * B), [c, c + B * B) must not overlap
		 - equivalent to:
			for all i, j in [0, B):
				a[i * B + j] = max(a[i * B + j], max{b[i * B + k] + c[k * B + j] | k in [0, B)})
	*/
	static void MaxPlusMul0(T *a, T *b, T *c) {
		constexpr int n = B;
		for (int k = 0; k < n; k += 2) for (int i = 0; i < n; i += 2) {
			vector_t coef00(b[(i + 0) * n + (k + 0)]);
			vector_t coef01(b[(i + 0) * n + (k + 1)]);
			vector_t coef10(b[(i + 1) * n + (k + 0)]);
			vector_t coef11(b[(i + 1) * n + (k + 1)]);
			
			T *aa = a + i * n;
			T *bb = c + k * n;
			for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) {
				vector_t t0(bb + j);
				vector_t t1(bb + n + j);
				max(t0 + coef00, t1 + coef01).chmax_store(aa + j);
				max(t0 + coef10, t1 + coef11).chmax_store(aa + n + j);
			}
		}
	}
	static void MaxPlusMul1(T *a, T *b, T *c) {
		constexpr int n = B;
		for (int k = 0; k < n; k += 4) for (int i = 0; i < n; i += 2) {
			vector_t coef00(b[(i + 0) * n + (k + 0)]);
			vector_t coef01(b[(i + 0) * n + (k + 1)]);
			vector_t coef02(b[(i + 0) * n + (k + 2)]);
			vector_t coef03(b[(i + 0) * n + (k + 3)]);
			vector_t coef10(b[(i + 1) * n + (k + 0)]);
			vector_t coef11(b[(i + 1) * n + (k + 1)]);
			vector_t coef12(b[(i + 1) * n + (k + 2)]);
			vector_t coef13(b[(i + 1) * n + (k + 3)]);
			
			T *aa = a + i * n;
			T *bb = c + k * n;
			for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) {
				vector_t t0(bb + j);
				vector_t t1(bb + n + j);
				vector_t t2(bb + n + n + j);
				vector_t t3(bb + n + n + n + j);
				max(max(t0 + coef00, t1 + coef01), max(t2 + coef02, t3 + coef03)).chmax_store(aa + j);
				max(max(t0 + coef10, t1 + coef11), max(t2 + coef12, t3 + coef13)).chmax_store(aa + n + j);
			}
		}
	}
	static void MaxPlusMul2(T *a, T *b, T *c) {
		constexpr int n = B;
		for (int k = 0; k < n; k += 2) for (int i = 0; i < n; i += 4) {
			vector_t coef00(b[(i + 0) * n + (k + 0)]);
			vector_t coef01(b[(i + 0) * n + (k + 1)]);
			vector_t coef10(b[(i + 1) * n + (k + 0)]);
			vector_t coef11(b[(i + 1) * n + (k + 1)]);
			vector_t coef20(b[(i + 2) * n + (k + 0)]);
			vector_t coef21(b[(i + 2) * n + (k + 1)]);
			vector_t coef30(b[(i + 3) * n + (k + 0)]);
			vector_t coef31(b[(i + 3) * n + (k + 1)]);
			
			T *aa = a + i * n;
			T *bb = c + k * n;
			for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) {
				vector_t t0(bb + j);
				vector_t t1(bb + n + j);
				max(t0 + coef00, t1 + coef01).chmax_store(aa + j);
				max(t0 + coef10, t1 + coef11).chmax_store(aa + n + j);
				max(t0 + coef20, t1 + coef21).chmax_store(aa + n + n + j);
				max(t0 + coef30, t1 + coef31).chmax_store(aa + n + n + n + j);
			}
		}
	}
	static void MaxPlusMul3(T *a, T *b, T *c) {
		constexpr int n = B;
		for (int k = 0; k < n; k += 4) for (int i = 0; i < n; i += 4) {
			vector_t coef00(b[(i + 0) * n + (k + 0)]);
			vector_t coef01(b[(i + 0) * n + (k + 1)]);
			vector_t coef02(b[(i + 0) * n + (k + 2)]);
			vector_t coef03(b[(i + 0) * n + (k + 3)]);
			vector_t coef10(b[(i + 1) * n + (k + 0)]);
			vector_t coef11(b[(i + 1) * n + (k + 1)]);
			vector_t coef12(b[(i + 1) * n + (k + 2)]);
			vector_t coef13(b[(i + 1) * n + (k + 3)]);
			vector_t coef20(b[(i + 2) * n + (k + 0)]);
			vector_t coef21(b[(i + 2) * n + (k + 1)]);
			vector_t coef22(b[(i + 2) * n + (k + 2)]);
			vector_t coef23(b[(i + 2) * n + (k + 3)]);
			vector_t coef30(b[(i + 3) * n + (k + 0)]);
			vector_t coef31(b[(i + 3) * n + (k + 1)]);
			vector_t coef32(b[(i + 3) * n + (k + 2)]);
			vector_t coef33(b[(i + 3) * n + (k + 3)]);
			
			T *aa = a + i * n;
			T *bb = c + k * n;
			for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T)) {
				vector_t t0(bb + j);
				vector_t t1(bb + n + j);
				vector_t t2(bb + n + n + j);
				vector_t t3(bb + n + n + n + j);
				max(max(t0 + coef00, t1 + coef01), max(t2 + coef02, t3 + coef03)).chmax_store(aa + j);
				max(max(t0 + coef10, t1 + coef11), max(t2 + coef12, t3 + coef13)).chmax_store(aa + n + j);
				max(max(t0 + coef20, t1 + coef21), max(t2 + coef22, t3 + coef23)).chmax_store(aa + n + n + j);
				max(max(t0 + coef30, t1 + coef31), max(t2 + coef32, t3 + coef33)).chmax_store(aa + n + n + n + j);
			}
		}
	}
	static void FWI(T *a, T *b, T *c) {
		if (a != b && a != c && b != c) {
			if (unroll_type == 0) MaxPlusMul0(a, b, c);
			if (unroll_type == 1) MaxPlusMul1(a, b, c);
			if (unroll_type == 2) MaxPlusMul2(a, b, c);
			if (unroll_type == 3) MaxPlusMul3(a, b, c);
			return;
		}
		constexpr int n = B;
		for (int k = 0; k < n; k++) for (int i = 0; i < n; i++) {
			vector_t coef(b[i * n + k]);
			
			T *aa = a + i * n;
			T *bb = c + k * n;
			for (int j = 0; j < n; j += vector_t::SIZE / sizeof(T))
				(vector_t(bb + j) + coef).chmax_store(aa + j);
		}
	}
	static void FWR(int n_blocks_power2, int n_blocks, int block_index0, int block_index1, int block_index2,
		T **block_start, bool symmetric) {
		
		if (block_index0 >= n_blocks || block_index1 >= n_blocks || block_index2 >= n_blocks) return;
		if (n_blocks_power2 == 1) {
			FWI(
				block_start[block_index0 * n_blocks + block_index2],
				block_start[block_index0 * n_blocks + block_index1],
				block_start[block_index1 * n_blocks + block_index2]
			);
		} else {
			int half = n_blocks_power2 >> 1;
			if (!symmetric) {
				FWR(half, n_blocks, block_index0       , block_index1       , block_index2       , block_start, false);
				FWR(half, n_blocks, block_index0       , block_index1       , block_index2 + half, block_start, false);
				FWR(half, n_blocks, block_index0 + half, block_index1       , block_index2       , block_start, false);
				FWR(half, n_blocks, block_index0 + half, block_index1       , block_index2 + half, block_start, false);
				FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2 + half, block_start, false);
				FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2       , block_start, false);
				FWR(half, n_blocks, block_index0       , block_index1 + half, block_index2 + half, block_start, false);
				FWR(half, n_blocks, block_index0       , block_index1 + half, block_index2       , block_start, false);
			} else {
				// if symmetric, block_index0 = block_index1 = block_index2
				FWR(half, n_blocks, block_index0       , block_index1       , block_index2       , block_start, true);
				FWR(half, n_blocks, block_index0       , block_index1       , block_index2 + half, block_start, false);
				transpose_copy(half, n_blocks, block_index0, block_index0 + half, block_start);
				FWR(half, n_blocks, block_index0 + half, block_index1       , block_index2 + half, block_start, false);
				FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2 + half, block_start, true);
				FWR(half, n_blocks, block_index0 + half, block_index1 + half, block_index2       , block_start, false);
				transpose_copy(half, n_blocks, block_index0 + half, block_index0, block_start);
				FWR(half, n_blocks, block_index0       , block_index1 + half, block_index2       , block_start, false);
			}
		}
	}
	// copy [block_row_offset:block_row_offset+n)[block_column_offset:block_column_offset+n) to its transposed posititon
	// anything outside n_blocks * n_blocks blocks is ignored
	static void transpose_copy(int n, int n_blocks, int block_row_offset, int block_column_offset, T **block_start) {
		for (int i = block_row_offset; i < block_row_offset + n && i < n_blocks; i++) 
			for (int j = block_column_offset; j < block_column_offset + n && j < n_blocks; j++) {
			
			T *src = block_start[i * n_blocks + j];
			T *dst = block_start[j * n_blocks + i];
			for (int y = 0; y < B; y++) for (int x = 0; x < B; x++) dst[x * B + y] = src[y * B + x];
		}
	}
	/*
		rev == false :
			Copy the n * n elements in src to dst in the order like this(each src[i][j] is a BxB block):
			dst: src[0][0], src[0][1], src[1][0], src[1][1],
				 src[0][2], src[0][3], src[1][2], src[1][3],
				 src[2][0], src[2][1], src[3][0], src[3][1],
				 src[2][2], src[2][3], src[3][2], src[3][3],
				 src[0][4], src[0][5], src[1][4], src[1][5], ...
			and returns the pointer to the next element of the last element written in dst.
			Elements outside the n_blocks * n_blocks blocks will be ignored
			Elements in dst corresponding to elements outside the src_n * src_n but in n_blocks * n_blocks
				will be filled
			block_start[i * n_blocks + j] will point to the starting element in dst corresponding to (i, j) block
		
		rev == true :
			same as rev == false except that the copy direction is reversed and
				elements in dst where INF would be contained if !rev are untouched
		
		This function negates all the element and FWR handles everything with max instead of min.
		This is because chmax(mem, reg) can be implemented faster than chmin(mem, reg) with avx2+int64_t
		The cost of negation should be negligible for other combinations, where this trick is irrelevant
	*/
	static T *reorder(int src_n, int n_blocks_power2, T *dst_head, T *src, T **block_start,
		int block_row, int block_column, bool rev) {
		
		int n_blocks = (src_n + B - 1) / B;
		if (block_row >= n_blocks || block_column >= n_blocks) return dst_head;
		if (n_blocks_power2 == 1) {
			T *src_base = src + (block_row * B * src_n + block_column * B);
			for (int i = 0; i < B; i++) {
				if (block_row * B + i < src_n) {
					int length = std::min(B, src_n - block_column * B);
					if (!rev) {
						for (int j = 0; j < length; j++) dst_head[i * B + j] = -src_base[i * src_n + j];
						for (int j = length; j < B; j++) dst_head[i * B + j] = -INF;
					} else {
						for (int j = 0; j < length; j++) src_base[i * src_n + j] = -dst_head[i * B + j];
					}
				} else {
					if (!rev) std::fill(dst_head + i * B, dst_head + (i + 1) * B, -INF);
				}
			}
			block_start[block_row * n_blocks + block_column] = dst_head;
			return dst_head + B * B;
		} else {
			int n_blocks_p2_half = n_blocks_power2 >> 1;
			// split into 2x2 recursively
			for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++)
				dst_head = reorder(src_n, n_blocks_p2_half, dst_head, src, block_start,
					block_row + i * n_blocks_p2_half, block_column + j * n_blocks_p2_half, rev);
			return dst_head;
		}
	}
public:
	static void run(int src_n, const T *input_matrix, T *output_matrix, bool symmetric = false) {
		assert(0 <= src_n && src_n < 65536);
		if (src_n == 0) return;
		int n_blocks = (src_n + B - 1) / B; // number of BxB blocks in a row
		int n_blocks_power2 = 1; // smallest power of 2 >= src_n / B
		while (n_blocks_power2 * B < src_n) n_blocks_power2 *= 2;
		
		// allocate and align the needed buffers
		const size_t reordered_needed_size = (B * n_blocks) * (B * n_blocks) * sizeof(T);
		size_t reordered_buffer_size = reordered_needed_size + 64;
		void *reordered_org = malloc(reordered_buffer_size);
		assert(reordered_org);
		void *reordered = reordered_org;
		assert(std::align(64, reordered_needed_size, reordered, reordered_buffer_size));
		// block_start[i][j] : pointer to the starting element of the (i, j) block in t 
		T **block_start = (T **) malloc(n_blocks * n_blocks * sizeof(T *));
		std::fill(block_start, block_start + n_blocks * n_blocks, nullptr);
		
		reorder(src_n, n_blocks_power2, (T *) reordered, const_cast<T *>(input_matrix), block_start, 0, 0, false);
		FWR(n_blocks_power2, n_blocks, 0, 0, 0, block_start, symmetric);
		reorder(src_n, n_blocks_power2, (T *) reordered, output_matrix, block_start, 0, 0, true);
		
		free(reordered_org);
		free(block_start);
	}
};

} // namespace quick_floyd_warshall

using namespace quick_floyd_warshall;

int main() {
    cin.tie(nullptr);
    ios::sync_with_stdio(false);
    LL(n, m, l);
    --l;
    VL t(n);
    vin(t);
    using runner = floyd_warshall<InstSet::DEFAULT, int64_t, 3>;
    vector<int64_t> matrix(n * n, runner::INF);
    REP(m) {
        LL(a, b, c);
        --a;
        --b;
        matrix[a * n + b] = matrix[b * n + a] = c;
    }
    REP(i, n) matrix[i * (n + 1)] = 0;
    runner::run(n, matrix.data(), matrix.data(), true);
    ll ans = INF;
    REP(i, n) {
        ll memo = 0;
        ll x = 0;
        REP(j, n) {
            memo += matrix[i * n + j] * t[j];
            if (t[j]) chmin(x, matrix[l * n + j] - matrix[i * n + j]);
        }
        memo *= 2;
        chmin(ans, memo + x);
    }
    print(ans);
}
yukicoder

結果

ソースコード