```go
package main
import (
"bufio"
"fmt"
"os"
)
func main() {
// Use buffered input/output for speed, though N=4000 is small enough for basic IO.
in := bufio.NewReader(os.Stdin)
var n, k int
fmt.Fscan(in, &n, &k)
// If k > n, no subarray of length k can be chosen. Cost is 0 for all arrays.
if k > n {
fmt.Println(0)
return
}
const mod int64 = 998244353
// Precompute powers of k modulo mod
powK := make([]int64, n+1)
powK[0] = 1
k64 := int64(k)
for i := 1; i <= n; i++ {
powK[i] = (powK[i-1] * k64) % mod
}
// h[m] will store the number of arrays of length m such that:
// 1. The array contains no valid subarray of length k (where valid means a permutation of 1..k).
// 2. The suffix of length k-1 consists of distinct elements.
// Note: Indices 0 to n-1 are allocated. We only use indices from k-1 to n-1.
h := make([]int64, n)
// dp[j] stores the number of "bad" arrays (no valid subarray) of the current length
// such that the longest distinct suffix has length j.
// Since the arrays are "bad", they cannot have a distinct suffix of length k (which would be a valid perm).
// So j ranges from 1 to k-1. We use a slice of size k and ignore index 0.
dp := make([]int64, k)
// Base case: arrays of length 1.
// Any array of length 1 is "bad" (since k >= 2) and has a distinct suffix of length 1.
// There are k choices for the single element.
dp[1] = k64
// If k=2, then length 1 arrays have a distinct suffix of length k-1=1.
// So h[1] should be initialized.
if k == 2 {
h[1] = dp[1]
}
newDp := make([]int64, k)
suffixSum := make([]int64, k+1)
// DP to compute h[i] for i from 2 to n-1
for i := 2; i < n; i++ {
// Calculate suffix sums of the previous DP state
// suffixSum[j] = sum(dp[j...k-1])
var currentSum int64 = 0
for j := k - 1; j >= 1; j-- {
currentSum = (currentSum + dp[j])
if currentSum >= mod {
currentSum -= mod
}
suffixSum[j] = currentSum
}
// Calculate new DP values for length i
for j := 1; j < k; j++ {
// Transition 1: Extend a distinct suffix of length j-1 to j.
// This is possible if we pick one of the (k - (j-1)) available distinct values.
// Valid for j > 1.
var term1 int64
if j > 1 {
term1 = dp[j-1] * int64(k-j+1)
term1 %= mod
}
// Transition 2: Reset the distinct suffix length to j.
// This happens if we append a value that is already in the suffix,
// such that the new distinct suffix has length j.
// This sums up contributions from all previous lengths >= j.
term2 := suffixSum[j]
val := term1 + term2
if val >= mod {
val -= mod
}
newDp[j] = val
}
// Update dp array
for j := 1; j < k; j++ {
dp[j] = newDp[j]
}
// If the current length i is at least k-1, we record h[i].
// h[i] corresponds to the count of bad arrays ending with k-1 distinct elements.
if i >= k-1 {
h[i] = dp[k-1]
}
}
// S[i] stores the sum of costs for all arrays of length i.
S := make([]int64, n+1)
// S[0]...S[k-1] are 0 because no valid subarrays can be formed.
for i := k; i <= n; i++ {
var sumCosts int64 = 0
// We iterate over the possible start position of the *first* valid subarray.
// If the first valid subarray is A[j...j+k-1], then:
// 1. A[1...j+k-2] contains no valid subarray.
// 2. A[j...j+k-1] is a permutation.
// This implies A[1...j+k-1] satisfies the conditions counted by h[j+k-2] extended by 1 element
// to complete the permutation. There is exactly 1 way to complete the permutation given the prefix.
// So we use h[j+k-2] ways for the prefix part.
// The remaining suffix has length i - (j+k-1). Let this be 'rem'.
// The contribution is 1 (for the current subarray) + cost of remaining suffix.
// Total ways for suffix is k^rem.
// rem ranges from 0 to i-k.
// Corresponding prefix length m = i - 1 - rem.
// We sum h[m] * (k^rem + S[rem]).
for rem := 0; rem <= i-k; rem++ {
m := i - 1 - rem
ways := h[m]
costSuffix := powK[rem] + S[rem]
if costSuffix >= mod {
costSuffix -= mod
}
term := (ways * costSuffix) % mod
sumCosts = (sumCosts + term)
if sumCosts >= mod {
sumCosts -= mod
}
}
S[i] = sumCosts
}
fmt.Println(S[n])
}
```