// Optional GCC tuning pragmas, left disabled:
//#pragma GCC optimize("Ofast")
//#pragma GCC optimize("unroll-loops")

// NOTE(review): the header name after `#include` and every `<...>` template
// parameter/argument list were missing from this file. They are restored here
// from the way each name is used below; the headers listed are exactly the
// standard headers this translation unit relies on.
#include <algorithm>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <utility>
#include <vector>
using namespace std;

using ll = long long;
using ull = unsigned long long;
using pii = pair<int, int>;
template <class T>
using V = vector<T>;
template <class T>
using VV = V<V<T>>;

// Builds a value-initialised one-dimensional vector of length `a`.
template <class T>
V<T> make_vec(size_t a) {
    return V<T>(a);
}

// Builds a nested vector: make_vec<T>(a, b, c) has shape a x b x c.
// The element type of each level is deduced from the level below it.
template <class T, class... Ts>
auto make_vec(size_t a, Ts... ts) {
    return V<decltype(make_vec<T>(ts...))>(a, make_vec<T>(ts...));
}

#define pb push_back
#define eb emplace_back
#define mp make_pair
#define fi first
#define se second
#define rep(i, n) rep2(i, 0, n)
#define rep2(i, m, n) for (int i = m; i < (n); i++)
#define per(i, b) per2(i, 0, b)
#define per2(i, a, b) for (int i = int(b) - 1; i >= int(a); i--)
#define ALL(c) (c).begin(), (c).end()
#define SZ(x) ((int)(x).size())

// Returns 10^n as a long long (n >= 0).
constexpr ll TEN(int n) { return (n == 0) ? 1 : 10 * TEN(n - 1); }

// t = min(t, u)
template <class T, class U>
void chmin(T& t, const U& u) {
    if (t > u) t = u;
}

// t = max(t, u)
template <class T, class U>
void chmax(T& t, const U& u) {
    if (t < u) t = u;
}

// Sorts `v` and removes duplicate values in place.
template <class T>
void mkuni(vector<T>& v) {
    sort(ALL(v));
    v.erase(unique(ALL(v)), end(v));
}

// Returns the indices 0..v.size()-1 ordered so that v[res[0]] <= v[res[1]] <= ...
template <class T>
vector<int> sort_by(const vector<T>& v) {
    vector<int> res(v.size());
    iota(res.begin(), res.end(), 0);
    sort(res.begin(), res.end(), [&](int i, int j) { return v[i] < v[j]; });
    return res;
}

// Reads a pair as "first second".
template <class T, class U>
istream& operator>>(istream& is, pair<T, U>& p) {
    is >> p.first >> p.second;
    return is;
}

// Writes a pair as "(first,second)".
template <class T, class U>
ostream& operator<<(ostream& os, const pair<T, U>& p) {
    os << "(" << p.first << "," << p.second << ")";
    return os;
}

// Reads v.size() elements into an already-sized vector.
template <class T>
istream& operator>>(istream& is, vector<T>& v) {
    for (auto& x : v) {
        is >> x;
    }
    return is;
}

// Writes a vector as "{a,b,c}".
template <class T>
ostream& operator<<(ostream& os, const vector<T>& v) {
    os << "{";
    rep(i, SZ(v)) {
        if (i) os << ",";
        os << v[i];
    }
    os << "}";
    return os;
}

// Debug helpers: they print to stderr when LOCAL is defined and expand to a
// no-op expression otherwise.
#ifdef LOCAL
void debug_out() { cerr << endl; }
template <typename Head, typename... Tail>
void debug_out(Head H, Tail... T) {
    cerr << " " << H;
    debug_out(T...);
}
#define debug(...) \
    cerr << __LINE__ << " [" << #__VA_ARGS__ << "]:", debug_out(__VA_ARGS__)
#define dump(x) cerr << __LINE__ << " " << #x << " = " << (x) << endl
#else
#define debug(...) (void(0))
#define dump(x) (void(0))
#endif

// Reads v.size() elements from stdin, adding `offset` to each one
// (e.g. offset = -1 to convert 1-based input to 0-based).
template <class T>
void scan(vector<T>& v, T offset = T(0)) {
    for (auto& x : v) {
        cin >> x;
        x += offset;
    }
}

// Prints `x` followed by a newline (suc == 1), a space (suc == 2),
// or nothing (any other value).
template <class T>
void print(T x, int suc = 1) {
    cout << x;
    if (suc == 1)
        cout << "\n";
    else if (suc == 2)
        cout << " ";
}

// Prints the elements space-separated; the last element is followed by the
// terminator selected by `suc` (see the scalar overload).
template <class T>
void print(const vector<T>& v, int suc = 1) {
    const int len = int(v.size());
    for (int i = 0; i < len; ++i) print(v[i], i == len - 1 ? suc : 2);
}

// Static initialiser: unties cin from cout, disables C stdio synchronisation
// and selects fixed notation with 10 decimals for floating-point output.
struct prepare_io {
    prepare_io() {
        cin.tie(nullptr);
        ios::sync_with_stdio(false);
        cout << fixed << setprecision(10);
    }
} prep_io;

// Fenwick tree (binary indexed tree) over n elements, 0-based externally and
// stored 1-based internally in bit[1..n].
template <class T>
struct BIT {
    int n;
    vector<T> bit;

    BIT(int _n = 0) : n(_n), bit(n + 1) {}

    // Sum of elements in [0, i), 0 <= i <= n.
    T sum(int i) {
        T s = 0;
        while (i > 0) {
            s += bit[i];
            i -= i & -i;
        }
        return s;
    }

    // Adds x to element i, 0 <= i < n.
    void add(int i, T x) {
        ++i;
        while (i <= n) {
            bit[i] += x;
            i += i & -i;
        }
    }

    // Sum of elements in [l, r).
    T sum(int l, int r) { return sum(r) - sum(l); }

    // Smallest i with sum(i) >= w, or n + 1 if no such i exists.
    // Returns 0 for w <= 0. The descent assumes prefix sums are
    // non-decreasing, i.e. all stored values are non-negative.
    // NOTE: flagged "verify!!!!" by the original author.
    int lower_bound(T w) {
        if (w <= 0) return 0;
        int x = 0, l = 1;
        while (l * 2 <= n) l *= 2;  // largest power of two <= max(n, 1)
        for (int k = l; k > 0; k /= 2) {
            if (x + k <= n && bit[x + k] < w) {
                w -= bit[x + k];
                x += k;
            }
        }
        return x + 1;
    }
};

// Counts pairs (j, i) with j < i and vec[j] > vec[i].
// Every value must lie in [0, vec.size() + 10), since values are used directly
// as Fenwick-tree indices.
template <class T>
ll inversion(const V<T>& vec) {
    int n = vec.size();
    BIT<int> bit(n + 10);
    ll res = 0;
    rep(i, n) {
        // bit.sum(vec[i] + 1) = number of earlier elements <= vec[i].
        res += i - bit.sum(vec[i] + 1);
        bit.add(vec[i], 1);
    }
    return res;
}

// Solves one test case: reads M, K and a sequence A of length N = M * K from
// stdin and prints a single integer.
//
// Assumes every value of A is in [0, M) and occurs exactly K times
// (A[i] indexes vp, and vp[i][j] is read for all j < K) -- TODO confirm
// against the problem statement.
void slv() {
    int M, K;
    cin >> M >> K;
    int N = M * K;

    V<int> A(N);
    cin >> A;

    // vp[v] = positions of value v in A, in increasing order.
    VV<int> vp(M);
    rep(i, N) vp[A[i]].pb(i);

    // Target slot of each position: the j-th occurrence of value i goes to
    // slot i + j * M, i.e. block j holds the values in the order 0..M-1.
    V<int> to(N);
    rep(i, M) {
        rep(j, K) { to[vp[i][j]] = i + j * M; }
    }
    debug(to);

    ll cur = inversion(to);
    ll ans = cur;
    debug(cur);

    // pos[i][j] = rank (0-based, by position in A) of value i among the j-th
    // occurrences of all M values.
    auto pos = make_vec<int>(M, K);
    rep(j, K) {
        V<pii> vec;
        rep(i, M) { vec.eb(vp[i][j], i); }
        sort(ALL(vec));
        rep(i, M) { pos[vec[i].se][j] = i; }
    }

    // For i = 0..M-2, update the running cost by
    // sum_j ((M - 1 - pos[i][j]) - pos[i][j]) and keep the minimum.
    // Presumably this moves value i from the front to the back of every
    // block, enumerating the cyclic rotations of the value order -- confirm.
    rep(i, M - 1) {
        rep(j, K) {
            ll ch = (M - 1 - pos[i][j]) - pos[i][j];
            cur += ch;
        }
        debug(i + 1, cur);
        chmin(ans, cur);
    }
    print(ans);
}

int main() {
    int cases = 1;
    // cin >> cases;
    rep(i, cases) slv();

    return 0;
}