#include using namespace atcoder; #include using namespace std; // #include #pragma GCC target("avx2") #pragma GCC optimize("O3") #pragma GCC optimize("unroll-loops") #define rep(i, n) for (int i = 0; i < (int)(n); i++) #define rep2(i,a,b) for (int i = (int)(a); i < (int)(b); i++) #define all(v) v.begin(),v.end() #define inc(x,l,r) ((l)<=(x)&&(x)<(r)) #define Unique(x) sort(all(x)), x.erase(unique(all(x)), x.end()) #define pcnt __builtin_popcountll typedef long long ll; #define int ll using ld = long double; using vi = vector; using vs = vector; using P = pair; using vp = vector

; // using Bint = boost::multiprecision::cpp_int; template bool chmax(T1 &a, const T2 b) {if (a < b) {a = b; return true;} else return false; } template bool chmin(T1 &a, const T2 b) {if (a > b) {a = b; return true;} else return false; } template using priority_queue_greater = priority_queue, greater>; template ostream &operator<< (ostream &os, const pair &p){os << p.first <<" "<> (istream &is, modint1000000007 &m){ll in;is>>in;m=in;return is;} ostream &operator<< (ostream &os, const modint998244353 &m){os << m.val();return os;} istream &operator>> (istream &is, modint998244353 &m){ll in;is>>in;m=in;return is;} template istream &operator>>(istream& is,vector &v){for(T &in:v)is>>in;return is;} template void input(T&... a){(cin>> ... >> a);} #ifdef LOCAL template ostream &operator<<(ostream &os,const vector &v){os<<"\x1b[32m";rep(i,v.size())os< int print(T& a){cout << "\x1b[32m"<< a<< '\n' << "\x1b[0m";return 0;} template int print(const T&a, const Ts&... b){cout << "\x1b[32m" << a;(cout<<...<<(cout<<' ',b));cout<<'\n' << "\x1b[0m";return 0;} #else template ostream &operator<<(ostream &os,const vector &v){rep(i,v.size())os< int print(T& a){cout << a<< '\n';return 0;} template int print(const T&a, const Ts&... b){cout << a;(cout<<...<<(cout<<' ',b));cout<<'\n';return 0;} #endif #define VI(v,n) vi v(n); input(v) #define INT(...) int __VA_ARGS__; input(__VA_ARGS__) #define STR(...) string __VA_ARGS__; input(__VA_ARGS__) #define CHAR(...) char __VA_ARGS__; input(__VA_ARGS__) int sign(ll x){return x>0?1:x<0?-1:0;} ll ceil(ll x,ll y){assert(y!=0);if(sign(x)==sign(y))return (x+y-1)/y;return -((-x/y));} ll floor(ll x,ll y){assert(y!=0);if(sign(x)==sign(y))return x/y;if(y<0)x*=-1,y*=-1;return x/y-(x%y<0);} ll abs(ll x,ll y){return abs(x-y);} ll bit(int n){return 1ll< _fac,_ifac,_inv; void init(){ for(int i=2;i<=MAX_N;i++)_fac[i]=_fac[i-1]*i; _ifac[MAX_N]=1/_fac[MAX_N]; for(int i=MAX_N-1;i>=1;i--)_ifac[i]=_ifac[i+1]*(i+1); for(int i=2;iMAX_N||r>MAX_N)return nCr_naive(n,r);return nPr(n,r)*ifac(r);} mint nHr(int n,int r,int low=0){r-=n*low;if(n==0&&r==0)return 1;if(n<=0||r<0)return 0;return nCr(n+r-1,r);} mint nCr_naive(int n,int r){ chmin(r,n-r); mint res = 1; rep(i,r)res*=(n-i),res*=_inv[r-i]; return res; } } com; signed main() { cin.tie(0); ios_base::sync_with_stdio(false); cout << fixed << setprecision(20); INT(n,p); if(p>n)return print(com.fac(n)-1); vector dp(n+1);//dp[i]:=長さpのサイクルをi個作るとき dp[0] = 1; rep(i,n)if((i+1)*p<=n){ dp[i+1] = dp[i]*com.inv(i+1)*com.nCr(n-p*i,p)*com.fac(p)*com.inv(p); } print(com.fac(n)-accumulate(all(dp),mint(0))); return 0; }