|
9 | 9 | "fmt" |
10 | 10 | "strconv" |
11 | 11 | "strings" |
| 12 | + "time" |
12 | 13 |
|
13 | 14 | corev1 "k8s.io/api/core/v1" |
14 | 15 | metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
@@ -172,100 +173,121 @@ func MakePGBackrestLogDir(template *corev1.PodTemplateSpec, |
172 | 173 | // - Renames the data directory as needed to bootstrap the cluster using the restored database. |
173 | 174 | // This ensures compatibility with the "existing" bootstrap method that is included in the |
174 | 175 | // Patroni config when bootstrapping a cluster using an existing data directory. |
175 | | -func RestoreCommand(pgdata, hugePagesSetting, fetchKeyCommand string, tablespaceVolumes []*corev1.PersistentVolumeClaim, args ...string) []string { |
176 | | - |
177 | | - // After pgBackRest restores files, PostgreSQL starts in recovery to finish |
178 | | - // replaying WAL files. "hot_standby" is "on" so we can detect |
179 | | - // when recovery has finished. In that mode, some parameters cannot be |
180 | | - // smaller than they were when PostgreSQL was backed up. Configure them to |
181 | | - // match the values reported by "pg_controldata". Those parameters are also |
182 | | - // written to WAL files and may change during recovery. When they increase, |
183 | | - // PostgreSQL exits and we reconfigure and restart it. |
184 | | - // For PG14, when some parameters from WAL require a restart, the behavior is |
185 | | - // to pause unless a restart is requested. For this edge case, we run a CASE |
186 | | - // query to check |
187 | | - // (a) if the instance is in recovery; |
188 | | - // (b) if so, if the WAL replay is paused; |
189 | | - // (c) if so, to unpause WAL replay, allowing our expected behavior to resume. |
190 | | - // A note on the PostgreSQL code: we cast `pg_catalog.pg_wal_replay_resume()` as text |
191 | | - // because that method returns a void (which is a non-NULL but empty result). When |
192 | | - // that void is cast as a string, it is an '' |
193 | | - // - https://www.postgresql.org/docs/current/hot-standby.html |
194 | | - // - https://www.postgresql.org/docs/current/app-pgcontroldata.html |
| 176 | +func RestoreCommand(pgdata, hugePagesSetting, fetchKeyCommand string, _ []*corev1.PersistentVolumeClaim, args ...string) []string { |
| 177 | + ps := postgres.NewParameterSet() |
| 178 | + ps.Add("data_directory", pgdata) |
| 179 | + ps.Add("huge_pages", hugePagesSetting) |
195 | 180 |
|
196 | | - // The postmaster.pid file is removed, if it exists, before attempting a restore. |
197 | | - // This allows the restore to be tried more than once without causing an |
198 | | - // error due to the presence of the file in subsequent attempts. |
| 181 | + // Keep history and WAL files until the cluster starts with its normal |
| 182 | + // archiving enabled. |
| 183 | + ps.Add("archive_command", "false -- store WAL files locally for now") |
| 184 | + ps.Add("archive_mode", "on") |
199 | 185 |
|
200 | | - // The 'pg_ctl' timeout is set to a very large value (1 year) to ensure there |
201 | | - // are no timeouts when starting or stopping Postgres. |
202 | | - |
203 | | - tablespaceCmd := "" |
204 | | - for _, tablespaceVolume := range tablespaceVolumes { |
205 | | - tablespaceCmd = tablespaceCmd + fmt.Sprintf( |
206 | | - "\ninstall --directory --mode=0700 '/tablespaces/%s/data'", |
207 | | - tablespaceVolume.Labels[naming.LabelData]) |
208 | | - } |
| 186 | + // Enable "hot_standby" so we can connect to Postgres and observe its |
| 187 | + // progress during recovery. |
| 188 | + ps.Add("hot_standby", "on") |
209 | 189 |
|
210 | | - // If the fetch key command is not empty, save the GUC variable and value |
211 | | - // to a new string. |
212 | | - var ekc string |
213 | 190 | if fetchKeyCommand != "" { |
214 | | - ekc = ` |
215 | | -encryption_key_command = '` + fetchKeyCommand + `'` |
| 191 | + ps.Add("encryption_key_command", fetchKeyCommand) |
216 | 192 | } |
217 | 193 |
|
218 | | - restoreScript := `declare -r pgdata="$1" opts="$2" |
219 | | -install --directory --mode=0700 "${pgdata}"` + tablespaceCmd + ` |
220 | | -rm -f "${pgdata}/postmaster.pid" |
221 | | -bash -xc "pgbackrest restore ${opts}" |
222 | | -rm -f "${pgdata}/patroni.dynamic.json" |
223 | | -export PGDATA="${pgdata}" PGHOST='/tmp' |
224 | | -
|
225 | | -until [[ "${recovery=}" == 'f' ]]; do |
226 | | -if [[ -z "${recovery}" ]]; then |
227 | | -control=$(pg_controldata) |
228 | | -read -r max_conn <<< "${control##*max_connections setting:}" |
229 | | -read -r max_lock <<< "${control##*max_locks_per_xact setting:}" |
230 | | -read -r max_ptxn <<< "${control##*max_prepared_xacts setting:}" |
231 | | -read -r max_work <<< "${control##*max_worker_processes setting:}" |
232 | | -echo > /tmp/pg_hba.restore.conf 'local all "postgres" peer' |
233 | | -cat > /tmp/postgres.restore.conf <<EOF |
234 | | -archive_command = 'false' |
235 | | -archive_mode = 'on' |
236 | | -hba_file = '/tmp/pg_hba.restore.conf' |
237 | | -hot_standby = 'on' |
238 | | -max_connections = '${max_conn}' |
239 | | -max_locks_per_transaction = '${max_lock}' |
240 | | -max_prepared_transactions = '${max_ptxn}' |
241 | | -max_worker_processes = '${max_work}' |
242 | | -unix_socket_directories = '/tmp'` + |
243 | | - // Add the encryption key command setting, if provided. |
244 | | - ekc + ` |
245 | | -huge_pages = ` + hugePagesSetting + ` |
246 | | -EOF |
247 | | -if [[ "$(< "${pgdata}/PG_VERSION")" -ge 12 ]]; then |
248 | | -read -r max_wals <<< "${control##*max_wal_senders setting:}" |
249 | | -echo >> /tmp/postgres.restore.conf "max_wal_senders = '${max_wals}'" |
250 | | -fi |
251 | | -
|
252 | | -read -r stopped <<< "${control##*recovery ending location:}" |
253 | | -pg_ctl start --silent --timeout=31536000 --wait --options='--config-file=/tmp/postgres.restore.conf' || failed=$? |
254 | | -[[ "${started-}" == "${stopped}" && -n "${failed-}" ]] && exit "${failed}" |
255 | | -started="${stopped}" && [[ -n "${failed-}" ]] && failed= && continue |
256 | | -fi |
257 | | -
|
258 | | -recovery=$(psql -Atc "SELECT CASE |
259 | | - WHEN NOT pg_catalog.pg_is_in_recovery() THEN false |
260 | | - WHEN NOT pg_catalog.pg_is_wal_replay_paused() THEN true |
261 | | - ELSE pg_catalog.pg_wal_replay_resume()::text = '' |
262 | | -END recovery" && sleep 1) ||: |
263 | | -done |
264 | | -
|
265 | | -pg_ctl stop --silent --wait --timeout=31536000 |
266 | | -mv "${pgdata}" "${pgdata}_bootstrap"` |
267 | | - |
268 | | - return append([]string{"bash", "-ceu", "--", restoreScript, "-", pgdata}, args...) |
| 194 | + configure := strings.Join([]string{ |
| 195 | + // With "hot_standby" on, some parameters cannot be smaller than they were |
| 196 | + // when Postgres was backed up. Configure these to match values reported by |
| 197 | + // "pg_controldata" before starting Postgres. These parameters are also |
| 198 | + // written to WAL files and may change during recovery. When they increase, |
| 199 | + // Postgres exits and we reconfigure it here. |
| 200 | + // - https://www.postgresql.org/docs/current/app-pgcontroldata.html |
| 201 | + `control=$(LC_ALL=C pg_controldata)`, |
| 202 | + `read -r max_conn <<< "${control##*max_connections setting:}"`, |
| 203 | + `read -r max_lock <<< "${control##*max_locks_per_xact setting:}"`, |
| 204 | + `read -r max_ptxn <<< "${control##*max_prepared_xacts setting:}"`, |
| 205 | + `read -r max_work <<< "${control##*max_worker_processes setting:}"`, |
| 206 | + |
| 207 | + // During recovery, only allow connections over the domain socket. |
| 208 | + `echo > /tmp/pg_hba.restore.conf 'local all "postgres" peer'`, |
| 209 | + |
| 210 | + // Combine parameters from Go with those detected in Bash. |
| 211 | + `cat > /tmp/postgres.restore.conf <<'EOF'`, ps.String(), `EOF`, |
| 212 | + `cat >> /tmp/postgres.restore.conf <<EOF`, |
| 213 | + `hba_file = '/tmp/pg_hba.restore.conf'`, |
| 214 | + `max_connections = '${max_conn}'`, |
| 215 | + `max_locks_per_transaction = '${max_lock}'`, |
| 216 | + `max_prepared_transactions = '${max_ptxn}'`, |
| 217 | + `max_worker_processes = '${max_work}'`, |
| 218 | + `EOF`, |
| 219 | + |
| 220 | + `version=$(< "${PGDATA}/PG_VERSION")`, |
| 221 | + |
| 222 | + // PostgreSQL v12 began recording "max_wal_senders" in the control file, so "pg_controldata" reports it only from that version on. |
| 223 | + `if [[ "${version}" -ge 12 ]]; then`, |
| 224 | + `read -r max_wals <<< "${control##*max_wal_senders setting:}"`, |
| 225 | + `echo >> /tmp/postgres.restore.conf "max_wal_senders = '${max_wals}'"`, |
| 226 | + `fi`, |
| 227 | + |
| 228 | + // TODO(sockets): PostgreSQL v14 is able to connect over abstract sockets in the network namespace. |
| 229 | + `PGHOST=$([[ "${version}" -ge 14 ]] && echo '/tmp' || echo '/tmp')`, |
| 230 | + `echo >> /tmp/postgres.restore.conf "unix_socket_directories = '${PGHOST}'"`, |
| 231 | + }, "\n") |
| 232 | + |
| 233 | + script := strings.Join([]string{ |
| 234 | + `declare -r PGDATA="$1" opts="$2"; export PGDATA PGHOST`, |
| 235 | + |
| 236 | + // Remove any "postmaster.pid" file leftover from a prior failure. |
| 237 | + `rm -f "${PGDATA}/postmaster.pid"`, |
| 238 | + |
| 239 | + // Run the restore and print its arguments. |
| 240 | + `bash -xc "pgbackrest restore ${opts}"`, |
| 241 | + |
| 242 | + // Ignore any Patroni settings present in the backup. |
| 243 | + `rm -f "${PGDATA}/patroni.dynamic.json"`, |
| 244 | + |
| 245 | + // By default, pg_ctl waits 60 seconds for Postgres to stop or start. |
| 246 | + // We want to be certain when Postgres is running or not, so we use |
| 247 | + // a very large timeout (365 days) to effectively wait forever. With |
| 248 | + // this, the result of "pg_ctl --wait" indicates the state of Postgres. |
| 249 | + // - https://www.postgresql.org/docs/current/app-pg-ctl.html |
| 250 | + fmt.Sprintf(`export PGCTLTIMEOUT=%d`, 365*24*time.Hour/time.Second), |
| 251 | + |
| 252 | + // Configure and start Postgres until we can see that it has finished |
| 253 | + // replaying WAL. |
| 254 | + // |
| 255 | + // PostgreSQL v13 and earlier exit when they need reconfiguration with |
| 256 | + // "hot_standby" on. This can cause pg_ctl to fail, so we compare the |
| 257 | + // LSN from before and after calling it. If the LSN changed, Postgres |
| 258 | + // ran and was able to replay WAL before exiting. In that case, configure |
| 259 | + // Postgres and start it again to see if it can make more progress. |
| 260 | + // |
| 261 | + // If Postgres exits after pg_ctl succeeds, psql returns nothing which |
| 262 | + // resets the "recovering" variable. Configure Postgres and start it again. |
| 263 | + `until [[ "${recovering=}" == 'f' ]]; do`, |
| 264 | + ` if [[ -z "${recovering}" ]]; then`, configure, |
| 265 | + ` read -r stopped <<< "${control##*recovery ending location:}"`, |
| 266 | + ` pg_ctl start --silent --wait --options='-c config_file=/tmp/postgres.restore.conf' || failed=$?`, |
| 267 | + ` [[ "${started-}" == "${stopped}" && -n "${failed-}" ]] && exit "${failed}"`, |
| 268 | + ` started="${stopped}" && [[ -n "${failed-}" ]] && failed= && continue`, |
| 269 | + ` fi`, |
| 270 | + // Ask Postgres if it is still recovering. PostgreSQL v14 pauses when it |
| 271 | + // needs reconfiguration with "hot_standby" on, and resuming replay causes |
| 272 | + // it to exit like prior versions. |
| 273 | + // - https://www.postgresql.org/docs/current/hot-standby.html |
| 274 | + // |
| 275 | + // NOTE: "pg_wal_replay_resume()" returns void which cannot be compared to |
| 276 | + // null. Instead, cast it to text and compare that for a boolean result. |
| 277 | + ` recovering=$(psql -Atc "SELECT CASE`, |
| 278 | + ` WHEN NOT pg_catalog.pg_is_in_recovery() THEN false`, |
| 279 | + ` WHEN NOT pg_catalog.pg_is_wal_replay_paused() THEN true`, |
| 280 | + ` ELSE pg_catalog.pg_wal_replay_resume()::text = ''`, |
| 281 | + ` END" && sleep 1) ||:`, |
| 282 | + `done`, |
| 283 | + |
| 284 | + // Replay is done. Stop Postgres gracefully and move the data directory |
| 285 | + // into position for our Patroni bootstrap method. |
| 286 | + `pg_ctl stop --silent --wait`, |
| 287 | + `mv "${PGDATA}" "${PGDATA}_bootstrap"`, |
| 288 | + }, "\n") |
| 289 | + |
| 290 | + return append([]string{"bash", "-ceu", "--", script, "-", pgdata}, args...) |
269 | 291 | } |
270 | 292 |
|
271 | 293 | // DedicatedSnapshotVolumeRestoreCommand returns the command for performing a pgBackRest delta restore |
|
0 commit comments