/**
 *  date : 2022-02-11 22:59:01
 */

// NOTE(review): the header names and every template parameter / argument list
// in this file were reconstructed from how the names are used. Confirm them
// against the upstream library before relying on exact types.

#define NDEBUG

// intrinsic
#include <immintrin.h>

#include <algorithm>
#include <array>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cfenv>
#include <cfloat>
#include <chrono>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <complex>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <deque>
#include <fstream>
#include <functional>
#include <initializer_list>
#include <iomanip>
#include <ios>
#include <iostream>
#include <istream>
#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ostream>
#include <queue>
#include <random>
#include <set>
#include <sstream>
#include <stack>
#include <streambuf>
#include <string>
#include <tuple>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

// Placed after the includes so that namespace std is declared before it is
// named in a using-directive.
using namespace std;

// utility
namespace Nyaan {
using ll = long long;
using i64 = long long;
using u64 = unsigned long long;
using i128 = __int128_t;
using u128 = __uint128_t;

template <typename T>
using V = vector<T>;
template <typename T>
using VV = vector<vector<T>>;
using vi = vector<int>;
using vl = vector<long long>;
using vd = V<double>;
using vs = V<string>;
using vvi = vector<vector<int>>;
using vvl = vector<vector<long long>>;

// A pair with component-wise arithmetic (+, -, *), scalar multiplication and
// unary minus.
template <typename T, typename U>
struct P : pair<T, U> {
  template <typename... Args>
  P(Args... args) : pair<T, U>(args...) {}

  using pair<T, U>::first;
  using pair<T, U>::second;

  P &operator+=(const P &r) {
    first += r.first;
    second += r.second;
    return *this;
  }
  P &operator-=(const P &r) {
    first -= r.first;
    second -= r.second;
    return *this;
  }
  P &operator*=(const P &r) {
    first *= r.first;
    second *= r.second;
    return *this;
  }
  template <typename S>
  P &operator*=(const S &r) {
    first *= r, second *= r;
    return *this;
  }
  P operator+(const P &r) const { return P(*this) += r; }
  P operator-(const P &r) const { return P(*this) -= r; }
  P operator*(const P &r) const { return P(*this) *= r; }
  template <typename S>
  P operator*(const S &r) const {
    return P(*this) *= r;
  }
  P operator-() const { return P{-first, -second}; }
};

using pl = P<ll, ll>;
using pi = P<int, int>;
using vp = V<pl>;

constexpr int inf = 1001001001;
constexpr long long infLL = 4004004004004004004LL;

// Container size as int.
template <typename T>
int sz(const T &t) {
  return t.size();
}

// Assigns y to x when y < x; returns whether x changed.
template <typename T, typename U>
inline bool amin(T &x, U y) {
  return (y < x) ? (x = y, true) : false;
}
// Assigns y to x when x < y; returns whether x changed.
template <typename T, typename U>
inline bool amax(T &x, U y) {
  return (x < y) ? (x = y, true) : false;
}

// Largest / smallest element; the vector must be non-empty because the
// iterator returned by max_element / min_element is dereferenced.
template <typename T>
inline T Max(const vector<T> &v) {
  return *max_element(begin(v), end(v));
}
template <typename T>
inline T Min(const vector<T> &v) {
  return *min_element(begin(v), end(v));
}
// Sum of all elements, accumulated in long long.
template <typename T>
inline long long Sum(const vector<T> &v) {
  return accumulate(begin(v), end(v), 0LL);
}

// Index of lower_bound / upper_bound of a in v.
template <typename T>
int lb(const vector<T> &v, const T &a) {
  return lower_bound(begin(v), end(v), a) - begin(v);
}
template <typename T>
int ub(const vector<T> &v, const T &a) {
  return upper_bound(begin(v), end(v), a) - begin(v);
}

// 10^n by binary exponentiation.
constexpr long long TEN(int n) {
  long long ret = 1, x = 10;
  for (; n; x *= x, n >>= 1) ret *= (n & 1 ? x : 1);
  return ret;
}

template <typename T, typename U>
pair<T, U> mkp(const T &t, const U &u) {
  return make_pair(t, u);
}

// Cumulative sums of v with size v.size() + 1: prefix sums (ret[0] == T())
// by default, suffix sums (ret[v.size()] == T()) when rev is true.
template <typename T>
vector<T> mkrui(const vector<T> &v, bool rev = false) {
  vector<T> ret(v.size() + 1);
  if (rev) {
    for (int i = int(v.size()) - 1; i >= 0; i--) ret[i] = v[i] + ret[i + 1];
  } else {
    for (int i = 0; i < int(v.size()); i++) ret[i + 1] = ret[i] + v[i];
  }
  return ret;
}

// Sorted copy of v with duplicates removed.
template <typename T>
vector<T> mkuni(const vector<T> &v) {
  vector<T> ret(v);
  sort(ret.begin(), ret.end());
  ret.erase(unique(ret.begin(), ret.end()), ret.end());
  return ret;
}

// The indices 0..N-1 sorted with comparator f.
template <typename F>
vector<int> mkord(int N, F f) {
  vector<int> ord(N);
  iota(begin(ord), end(ord), 0);
  sort(begin(ord), end(ord), f);
  return ord;
}

// inv[v[i]] = i, with -1 for values that do not occur; v must be non-empty
// and its values are used directly as indices.
template <typename T>
vector<int> mkinv(vector<T> &v) {
  int max_val = *max_element(begin(v), end(v));
  vector<int> inv(max_val + 1, -1);
  for (int i = 0; i < (int)v.size(); i++) inv[v[i]] = i;
  return inv;
}

// {0, 1, ..., n - 1}
vector<int> mkiota(int n) {
  vector<int> ret(n);
  iota(begin(ret), end(ret), 0);
  return ret;
}

// Reversed copy of v.
template <typename T>
T mkrev(const T &v) {
  T w{v};
  reverse(begin(w), end(w));
  return w;
}

template <typename T>
bool nxp(vector<T> &v) {
  return next_permutation(begin(v), end(v));
}

// Declare a vector (or a vector of vectors) and read it with in().
#define inV(T, v, n) \
  vector<T> v(n);    \
  in(v)
#define inVV(T, v, h, w)                \
  vector<vector<T>> v(h, vector<T>(w)); \
  in(v);

template <typename T>
using minpq = priority_queue<T, vector<T>, greater<T>>;

// Interval: half-open, (ng, ok] or [ok, ng).
// Integral T: bisects until ok and ng are adjacent.
// Other T: runs a fixed 60 bisection steps.
// Returns ok.
template <typename T, typename F>
T binary_search(T ng, T ok, const F &f) {
  if constexpr (is_integral<T>::value == true) {
    while (abs(ok - ng) > 1) {
      T x = (ok + ng) / 2;
      (f(x) ? ok : ng) = x;
    }
    return ok;
  } else {
    for (int iter = 0; iter < 60; iter++) {
      T x = (ok + ng) / 2;
      (f(x) ? ok : ng) = x;
    }
    return ok;
  }
}

// Solution interval: (l, r).
// Compares f at the two trisection points and drops the right third when
// (f(llr) < f(lrr)) != greater, otherwise the left third.
// Returns the midpoint of the final interval. The return type is T (it was
// declared void while returning a value).
template <typename T, typename F>
T ternary_search(T l, T r, const F &f, bool greater = false) {
  if constexpr (is_integral<T>::value == true) {
    while (abs(l - r) > 2) {
      T llr = (l * 2 + r * 1) / 3;
      T lrr = (l * 1 + r * 2) / 3;
      bool flag = f(llr) < f(lrr);
      if (flag != greater) {
        r = lrr;
      } else {
        l = llr;
      }
    }
    return (l + r) / 2;
  } else {
    for (int iter = 0; iter < 80; iter++) {
      T llr = (l * 2 + r * 1) / 3;
      T lrr = (l * 1 + r * 2) / 3;
      bool flag = f(llr) < f(lrr);
      if (flag != greater) {
        r = lrr;
      } else {
        l = llr;
      }
    }
    return (l + r) / 2;
  }
}

}  // namespace Nyaan

// bit operation
namespace Nyaan {
__attribute__((target("popcnt"))) inline int popcnt(const u64 &a) {
  return _mm_popcnt_u64(a);
}
// Index of the lowest set bit, or 64 when a == 0.
inline int lsb(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
inline int ctz(const u64 &a) { return a ? __builtin_ctzll(a) : 64; }
// Index of the highest set bit, or -1 when a == 0.
inline int msb(const u64 &a) { return a ? 63 - __builtin_clzll(a) : -1; }
// Bit i of a.
template <typename T>
inline int gbit(const T &a, int i) {
  return (a >> i) & 1;
}
// Sets bit i of a to b.
template <typename T>
inline void sbit(T &a, int i, bool b) {
  if (gbit(a, i) != b) a ^= T(1) << i;
}
constexpr long long PW(int n) { return 1LL << n; }
constexpr long long MSK(int n) { return (1LL << n) - 1; }
}  // namespace Nyaan

// inout
namespace Nyaan {

// Pairs and vectors are written space-separated and read element by element.
template <typename T, typename U>
ostream &operator<<(ostream &os, const pair<T, U> &p) {
  os << p.first << " " << p.second;
  return os;
}
template <typename T, typename U>
istream &operator>>(istream &is, pair<T, U> &p) {
  is >> p.first >> p.second;
  return is;
}

template <typename T>
ostream &operator<<(ostream &os, const vector<T> &v) {
  int s = (int)v.size();
  for (int i = 0; i < s; i++) os << (i ? " " : "") << v[i];
  return os;
}
template <typename T>
istream &operator>>(istream &is, vector<T> &v) {
  for (auto &x : v) is >> x;
  return is;
}

// 128-bit integers are read and written through a decimal string. The
// readers do not validate that the token consists of digits.
istream &operator>>(istream &is, __int128_t &x) {
  string S;
  is >> S;
  x = 0;
  bool negative = false;
  for (auto &c : S) {
    if (c == '-') {
      negative = true;
      continue;
    }
    x *= 10;
    x += c - '0';
  }
  if (negative) x = -x;
  return is;
}

istream &operator>>(istream &is, __uint128_t &x) {
  string S;
  is >> S;
  x = 0;
  for (auto &c : S) {
    x *= 10;
    x += c - '0';
  }
  return is;
}

ostream &operator<<(ostream &os, __int128_t x) {
  if (x == 0) return os << 0;
  if (x < 0) os << '-', x = -x;
  string S;
  while (x) S.push_back('0' + x % 10), x /= 10;
  reverse(begin(S), end(S));
  return os << S;
}
ostream &operator<<(ostream &os, __uint128_t x) {
  if (x == 0) return os << 0;
  string S;
  while (x) S.push_back('0' + x % 10), x /= 10;
  reverse(begin(S), end(S));
  return os << S;
}

// in(a, b, ...): reads each argument from cin in order.
void in() {}
template <typename T, class... U>
void in(T &t, U &...u) {
  cin >> t;
  in(u...);
}

// out(a, b, ...): writes the arguments separated by sep, then a newline.
void out() { cout << "\n"; }
template <typename T, class... U, char sep = ' '>
void out(const T &t, const U &...u) {
  cout << t;
  if (sizeof...(u)) cout << sep;
  out(u...);
}

// outr(a, b, ...): writes the arguments with no separator and no newline.
void outr() {}
template <typename T, class... U, char sep = ' '>
void outr(const T &t, const U &...u) {
  cout << t;
  outr(u...);
}

// Configures the standard streams during static initialisation: unties cin,
// disables stdio synchronisation and sets fixed precision on cout / cerr.
struct IoSetupNya {
  IoSetupNya() {
    cin.tie(nullptr);
    ios::sync_with_stdio(false);
    cout << fixed << setprecision(15);
    cerr << fixed << setprecision(7);
  }
} iosetupnya;

}  // namespace Nyaan

// debug
namespace DebugImpl {

// True for types that have a dedicated dump() overload below: types with a
// nested ::iterator, types with a ::first member, and integral types.
template <typename U, typename = void>
struct is_specialize : false_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, typename U::iterator, void>::type>
    : true_type {};
template <typename U>
struct is_specialize<
    U, typename conditional<false, decltype(U::first), void>::type>
    : true_type {};
template <typename U>
struct is_specialize<U, enable_if_t<is_integral<U>::value, void>> : true_type {
};

void dump(const char &t) { cerr << t; }

void dump(const string &t) { cerr << t; }

void dump(const bool &t) { cerr << (t ? "true" : "false"); }

void dump(__int128_t t) {
  if (t == 0) cerr << 0;
  if (t < 0) cerr << '-', t = -t;
  string S;
  while (t) S.push_back('0' + t % 10), t /= 10;
  reverse(begin(S), end(S));
  cerr << S;
}

void dump(__uint128_t t) {
  if (t == 0) cerr << 0;
  string S;
  while (t) S.push_back('0' + t % 10), t /= 10;
  reverse(begin(S), end(S));
  cerr << S;
}

// Fallback: anything without a dedicated overload is streamed to cerr.
template <typename U,
          enable_if_t<!is_specialize<U>::value, nullptr_t> = nullptr>
void dump(const U &t) {
  cerr << t;
}

// Integers: the sentinel values inf / infLL (and their negations for signed
// types) are printed as "inf" / "-inf".
template <typename T>
void dump(const T &t, enable_if_t<is_integral<T>::value> * = nullptr) {
  string res;
  if (t == Nyaan::inf) res = "inf";
  if constexpr (is_signed<T>::value) {
    if (t == -Nyaan::inf) res = "-inf";
  }
  if constexpr (sizeof(T) == 8) {
    if (t == Nyaan::infLL) res = "inf";
    if constexpr (is_signed<T>::value) {
      if (t == -Nyaan::infLL) res = "-inf";
    }
  }
  if (res.empty()) res = to_string(t);
  cerr << res;
}

template <typename T, typename U>
void dump(const pair<T, U> &);
template <typename T>
void dump(const pair<T *, int> &);

// Iterable containers: "[ a, b, c ]".
template <typename T>
void dump(const T &t,
          enable_if_t<!is_void<typename T::iterator>::value> * = nullptr) {
  cerr << "[ ";
  for (auto it = t.begin(); it != t.end();) {
    dump(*it);
    cerr << (++it == t.end() ? "" : ", ");
  }
  cerr << " ]";
}

// Pairs: "( first, second )".
template <typename T, typename U>
void dump(const pair<T, U> &t) {
  cerr << "( ";
  dump(t.first);
  cerr << ", ";
  dump(t.second);
  cerr << " )";
}

// (pointer, length) pairs are printed as an array of `second` elements.
template <typename T>
void dump(const pair<T *, int> &t) {
  cerr << "[ ";
  for (int i = 0; i < t.second; i++) {
    dump(t.first[i]);
    cerr << (i == t.second - 1 ? "" : ", ");
  }
  cerr << " ]";
}

void trace() { cerr << endl; }
template <typename Head, typename... Tail>
void trace(Head &&head, Tail &&...tail) {
  cerr << " ";
  dump(head);
  if (sizeof...(tail) != 0) cerr << ",";
  trace(forward<Tail>(tail)...);
}

}  // namespace DebugImpl

// trc(...) prints its arguments to cerr only when NyaanDebug is defined;
// otherwise it expands to a no-op.
#ifdef NyaanDebug
#define trc(...)                            \
  do {                                      \
    cerr << "## " << #__VA_ARGS__ << " = "; \
    DebugImpl::trace(__VA_ARGS__);          \
  } while (0)
#else
#define trc(...) (void(0))
#endif

// macro
#define each(x, v) for (auto&& x : v)
#define each2(x, y, v) for (auto&& [x, y] : v)
#define all(v) (v).begin(), (v).end()
#define rep(i, N) for (long long i = 0; i < (long long)(N); i++)
#define repr(i, N) for (long long i = (long long)(N)-1; i >= 0; i--)
#define rep1(i, N) for (long long i = 1; i <= (long long)(N); i++)
#define repr1(i, N) for (long long i = (N); (long long)(i) > 0; i--)
#define reg(i, a, b) for (long long i = (a); i < (b); i++)
#define regr(i, a, b) for (long long i = (b)-1; i >= (a); i--)
#define fi first
#define se second
#define ini(...)   \
  int __VA_ARGS__; \
  in(__VA_ARGS__)
#define inl(...)         \
  long long __VA_ARGS__; \
  in(__VA_ARGS__)
#define ins(...)      \
  string __VA_ARGS__; \
  in(__VA_ARGS__)
#define in2(s, t)                           \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i]);                         \
  }
#define in3(s, t, u)                        \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i]);                   \
  }
#define in4(s, t, u, v)                     \
  for (int i = 0; i < (int)s.size(); i++) { \
    in(s[i], t[i], u[i], v[i]);             \
  }
#define die(...)             \
  do {                       \
    Nyaan::out(__VA_ARGS__); \
    return;                  \
  } while (0)

namespace Nyaan {
void solve();
}
int main() { Nyaan::solve(); }

//

// Bit sequence stored in 64-bit blocks with per-block cumulative popcounts.
// Usage order: init(n), set(i) for each 1-bit, then build(); rank0 / rank1
// and `zeros` are meaningful only after build().
struct bit_vector {
  using u32 = uint32_t;
  using i64 = int64_t;
  using u64 = uint64_t;

  static constexpr u32 w = 64;
  vector<u64> block;
  vector<u32> count;  // count[b] = number of 1-bits in blocks [0, b)
  u32 n, zeros;       // length, and number of 0-bits among the first n bits

  inline u32 get(u32 i) const { return u32(block[i / w] >> (i % w)) & 1u; }
  // The shift is done on an unsigned 64-bit one so that bit 63 is set
  // without shifting into the sign bit of a signed value.
  inline void set(u32 i) { block[i / w] |= u64(1) << (i % w); }

  bit_vector() {}
  // Intentionally implicit: vector<bit_vector>::assign(count, n) relies on
  // the int -> bit_vector conversion.
  bit_vector(int _n) { init(_n); }
  __attribute__((optimize("O3,unroll-loops"))) void init(int _n) {
    n = zeros = _n;
    block.resize(n / w + 1, 0);
    count.resize(block.size(), 0);
  }

  __attribute__((target("popcnt"))) void build() {
    for (u32 i = 1; i < block.size(); ++i)
      count[i] = count[i - 1] + _mm_popcnt_u64(block[i - 1]);
    zeros = rank0(n);
  }

  // Number of 0-bits / 1-bits among positions [0, i).
  inline u32 rank0(u32 i) const { return i - rank1(i); }
  __attribute__((target("bmi2,popcnt"))) inline u32 rank1(u32 i) const {
    // _bzhi_u64 keeps only the low (i % w) bits of the block holding i.
    return count[i / w] + _mm_popcnt_u64(_bzhi_u64(block[i / w], i % w));
  }
};

// Wavelet matrix over a fixed point set with one Fenwick tree per level.
// S is the coordinate type, T the value type.
// Usage order: add_point() for every point that will ever receive a value,
// then build(), then add() / sum().
template <typename S, typename T>
struct WaveletMatrix {
  using u32 = uint32_t;
  using i64 = int64_t;
  using u64 = uint64_t;

  // Fenwick tree (binary indexed tree) over N slots.
  struct BIT {
    u32 N;
    vector<T> data;

    BIT() = default;
    // Intentionally implicit: vector<BIT>::assign(count, n) relies on it.
    BIT(int size) { init(size); }

    void init(int size) {
      N = size;
      data.assign(N + 1, 0);
    }

    // Adds x at 0-based position k. _blsi_u32 isolates the lowest set bit.
    __attribute__((target("bmi"))) void add(u32 k, T x) {
      for (++k; k <= N; k += _blsi_u32(k)) data[k] += x;
    }

    // Sum of positions [0, k). _blsr_u32 clears the lowest set bit.
    __attribute__((target("bmi"))) T sum(u32 k) const {
      T ret = T();
      for (; k; k = _blsr_u32(k)) ret += data[k];
      return ret;
    }

    // Sum of positions [l, r): both indices are walked down until they
    // meet, adding the nodes on r's path and subtracting those on l's.
    __attribute__((target("bmi"))) T sum(int l, int r) const {
      T ret = T();
      while (l != r) {
        if (l < r) {
          ret += data[r];
          r = _blsr_u32(r);
        } else {
          ret -= data[l];
          l = _blsr_u32(l);
        }
      }
      return ret;
    }
  };

  using P = pair<S, S>;
  int n, lg;              // number of distinct points, number of bit levels
  vector<bit_vector> bv;  // one bit_vector per level
  vector<BIT> bit;        // one Fenwick tree per level
  vector<P> ps;           // points, sorted and deduplicated by build()
  vector<S> ys;           // y coordinates, sorted and deduplicated by build()

  WaveletMatrix() {}

  // Registers (x, y) as a point that may later be passed to add().
  void add_point(S x, S y) {
    ps.emplace_back(x, y);
    ys.emplace_back(y);
  }

  __attribute__((optimize("O3"))) void build() {
    sort(begin(ps), end(ps));
    ps.erase(unique(begin(ps), end(ps)), end(ps));
    n = ps.size();
    sort(begin(ys), end(ys));
    ys.erase(unique(begin(ys), end(ys)), end(ys));

    // cur[i] = compressed y of the i-th point in x order.
    vector<u32> cur(n), nxt(n);
    for (int i = 0; i < n; ++i) cur[i] = yid(ps[i].second);
    lg = __lg(max(n, 1)) + 1;
    bv.assign(lg, n);
    bit.assign(lg, n);
    for (int h = lg - 1; h >= 0; --h) {
      for (int i = 0; i < n; ++i)
        if ((cur[i] >> h) & 1) bv[h].set(i);
      bv[h].build();
      // Stable partition by bit h: zeros go to the front of nxt, ones
      // follow starting at index bv[h].zeros.
      array<decltype(begin(nxt)), 2> it{begin(nxt), begin(nxt) + bv[h].zeros};
      for (int i = 0; i < n; ++i) *it[bv[h].get(i)]++ = cur[i];
      swap(cur, nxt);
    }
  }

  // Index of the first point whose x coordinate is >= x.
  int xid(S x) const {
    return lower_bound(
               begin(ps), end(ps), make_pair(x, S()),
               [](const P &a, const P &b) { return a.first < b.first; }) -
           begin(ps);
  }

  // Number of distinct registered y coordinates that are < y.
  int yid(S y) const { return lower_bound(begin(ys), end(ys), y) - begin(ys); }

  // Adds val at point (x, y). The position is located with lower_bound and
  // not checked, so (x, y) must have been registered before build().
  void add(S x, S y, T val) {
    int i = lower_bound(begin(ps), end(ps), P{x, y}) - begin(ps);
    for (int h = lg - 1; h >= 0; --h) {
      int i0 = bv[h].rank0(i);
      if (bv[h].get(i))
        i += bv[h].zeros - i0;
      else
        i = i0;
      bit[h].add(i, val);
    }
  }

  // Sum of values at point indices [l, r) whose compressed y is < upper.
  T sum(int l, int r, u32 upper) const {
    T res = 0;
    for (int h = lg; h--;) {
      int l0 = bv[h].rank0(l), r0 = bv[h].rank0(r);
      if ((upper >> h) & 1) {
        // Everything on the zero side of this level is below upper.
        res += bit[h].sum(l0, r0);
        l += bv[h].zeros - l0;
        r += bv[h].zeros - r0;
      } else {
        l = l0, r = r0;
      }
    }
    return res;
  }

  // Sum of values over points with lx <= x < rx and ly <= y < ry.
  T sum(S lx, S ly, S rx, S ry) const {
    int l = xid(lx), r = xid(rx);
    return sum(l, r, yid(ry)) - sum(l, r, yid(ly));
  }
};

//

using namespace Nyaan;

// Reads M, K and M * K values A, then prints the minimum of `cur` over the
// initial assignment and M successive shifts.
// Each position p whose value i occurs for the j-th time gets key
// w[p] = j * M + i. `cur` starts as the number of pairs p < q with
// w[q] < w[p]. Then, for base = 0..M-1, the keys of all positions holding
// value `base` are raised by M and `cur` is updated incrementally.
// Assumes values in A lie in [0, M) and each occurs K times - TODO confirm
// against the problem statement.
void Nyaan::solve() {
  inl(M, K);
  vl A(M * K);
  in(A);
  vvi v(M);
  rep(i, M * K) v[A[i]].push_back(i);

  // NOTE(review): the template arguments are a guess (int coordinates, ll
  // values); confirm against the original submission.
  WaveletMatrix<int, ll> wm;
  rep(i, M) rep(j, K) {
    int x = v[i][j];
    // Register both the initial key and the key after the shift by M.
    wm.add_point(x, j * M + i);
    wm.add_point(x, (j + 1) * M + i);
    trc(x, j * M + i);
  }
  wm.build();

  ll ans = 0;
  ll cur = 0;
  vi w(M * K);
  rep(i, M) rep(j, K) w[v[i][j]] = j * M + i;
  {
    trc(w);
    // Scan positions right to left, counting already inserted positions
    // (those to the right) with a smaller key.
    repr(i, M * K) {
      cur += wm.sum(i + 1, 0, M * K, w[i]);
      wm.add(i, w[i], 1);
      trc(i, wm.sum(i + 1, 0, M * K, w[i]));
    }
    trc(cur);
    ans = cur;
  }
  rep(base, M) {
    each(i, v[base]) {
      // Keys in (w[i], w[i] + M) to the left of i stop forming a pair with
      // i once its key is raised; those to the right start forming one.
      cur -= wm.sum(0, w[i] + 1, i, w[i] + M);
      wm.add(i, w[i], -1);
      cur += wm.sum(i + 1, w[i] + 1, M * K, w[i] + M);
      wm.add(i, w[i] + M, 1);
    }
    trc(cur);
    amin(ans, cur);
  }
  out(ans);
}