Skip to content

Commit 9d80a0b

Browse files
cbandy authored and gkech committed
Tidy the pgBackRest restore command using slices
Having these lines broken into string slices allows for Go comments that explain them without presenting those comments in YAML at runtime.

This also:

- Uses the postgres.ParameterSet type to accumulate Postgres settings. A new String method renders those values safely for use in postgresql.conf.
- Disables localization using LC_ALL=C in calls to pg_controldata before we parse its output.
- Removes commands to change permissions on tablespace directories; pgBackRest handles this for us now.
- Passes command line parameters to Postgres using "-c" rather than "--" long flags. Both work on Linux, but the former works on all systems.
- Explains why we need a large timeout for "pg_ctl --wait" and configures it once using the PGCTLTIMEOUT environment variable.
1 parent 3c48007 commit 9d80a0b

File tree

3 files changed

+134
-88
lines changed

3 files changed

+134
-88
lines changed

internal/pgbackrest/config.go

Lines changed: 110 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"fmt"
1010
"strconv"
1111
"strings"
12+
"time"
1213

1314
corev1 "k8s.io/api/core/v1"
1415
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -172,100 +173,121 @@ func MakePGBackrestLogDir(template *corev1.PodTemplateSpec,
172173
// - Renames the data directory as needed to bootstrap the cluster using the restored database.
173174
// This ensures compatibility with the "existing" bootstrap method that is included in the
174175
// Patroni config when bootstrapping a cluster using an existing data directory.
175-
func RestoreCommand(pgdata, hugePagesSetting, fetchKeyCommand string, tablespaceVolumes []*corev1.PersistentVolumeClaim, args ...string) []string {
176-
177-
// After pgBackRest restores files, PostgreSQL starts in recovery to finish
178-
// replaying WAL files. "hot_standby" is "on" so we can detect
179-
// when recovery has finished. In that mode, some parameters cannot be
180-
// smaller than they were when PostgreSQL was backed up. Configure them to
181-
// match the values reported by "pg_controldata". Those parameters are also
182-
// written to WAL files and may change during recovery. When they increase,
183-
// PostgreSQL exits and we reconfigure and restart it.
184-
// For PG14, when some parameters from WAL require a restart, the behavior is
185-
// to pause unless a restart is requested. For this edge case, we run a CASE
186-
// query to check
187-
// (a) if the instance is in recovery;
188-
// (b) if so, if the WAL replay is paused;
189-
// (c) if so, to unpause WAL replay, allowing our expected behavior to resume.
190-
// A note on the PostgreSQL code: we cast `pg_catalog.pg_wal_replay_resume()` as text
191-
// because that method returns a void (which is a non-NULL but empty result). When
192-
// that void is cast as a string, it is an empty string ('').
193-
// - https://www.postgresql.org/docs/current/hot-standby.html
194-
// - https://www.postgresql.org/docs/current/app-pgcontroldata.html
176+
func RestoreCommand(pgdata, hugePagesSetting, fetchKeyCommand string, _ []*corev1.PersistentVolumeClaim, args ...string) []string {
177+
ps := postgres.NewParameterSet()
178+
ps.Add("data_directory", pgdata)
179+
ps.Add("huge_pages", hugePagesSetting)
195180

196-
// The postmaster.pid file is removed, if it exists, before attempting a restore.
197-
// This allows the restore to be tried more than once without causing an
198-
// error due to the presence of the file in subsequent attempts.
181+
// Keep history and WAL files until the cluster starts with its normal
182+
// archiving enabled.
183+
ps.Add("archive_command", "false -- store WAL files locally for now")
184+
ps.Add("archive_mode", "on")
199185

200-
// The 'pg_ctl' timeout is set to a very large value (1 year) to ensure there
201-
// are no timeouts when starting or stopping Postgres.
202-
203-
tablespaceCmd := ""
204-
for _, tablespaceVolume := range tablespaceVolumes {
205-
tablespaceCmd = tablespaceCmd + fmt.Sprintf(
206-
"\ninstall --directory --mode=0700 '/tablespaces/%s/data'",
207-
tablespaceVolume.Labels[naming.LabelData])
208-
}
186+
// Enable "hot_standby" so we can connect to Postgres and observe its
187+
// progress during recovery.
188+
ps.Add("hot_standby", "on")
209189

210-
// If the fetch key command is not empty, save the GUC variable and value
211-
// to a new string.
212-
var ekc string
213190
if fetchKeyCommand != "" {
214-
ekc = `
215-
encryption_key_command = '` + fetchKeyCommand + `'`
191+
ps.Add("encryption_key_command", fetchKeyCommand)
216192
}
217193

218-
restoreScript := `declare -r pgdata="$1" opts="$2"
219-
install --directory --mode=0700 "${pgdata}"` + tablespaceCmd + `
220-
rm -f "${pgdata}/postmaster.pid"
221-
bash -xc "pgbackrest restore ${opts}"
222-
rm -f "${pgdata}/patroni.dynamic.json"
223-
export PGDATA="${pgdata}" PGHOST='/tmp'
224-
225-
until [[ "${recovery=}" == 'f' ]]; do
226-
if [[ -z "${recovery}" ]]; then
227-
control=$(pg_controldata)
228-
read -r max_conn <<< "${control##*max_connections setting:}"
229-
read -r max_lock <<< "${control##*max_locks_per_xact setting:}"
230-
read -r max_ptxn <<< "${control##*max_prepared_xacts setting:}"
231-
read -r max_work <<< "${control##*max_worker_processes setting:}"
232-
echo > /tmp/pg_hba.restore.conf 'local all "postgres" peer'
233-
cat > /tmp/postgres.restore.conf <<EOF
234-
archive_command = 'false'
235-
archive_mode = 'on'
236-
hba_file = '/tmp/pg_hba.restore.conf'
237-
hot_standby = 'on'
238-
max_connections = '${max_conn}'
239-
max_locks_per_transaction = '${max_lock}'
240-
max_prepared_transactions = '${max_ptxn}'
241-
max_worker_processes = '${max_work}'
242-
unix_socket_directories = '/tmp'` +
243-
// Add the encryption key command setting, if provided.
244-
ekc + `
245-
huge_pages = ` + hugePagesSetting + `
246-
EOF
247-
if [[ "$(< "${pgdata}/PG_VERSION")" -ge 12 ]]; then
248-
read -r max_wals <<< "${control##*max_wal_senders setting:}"
249-
echo >> /tmp/postgres.restore.conf "max_wal_senders = '${max_wals}'"
250-
fi
251-
252-
read -r stopped <<< "${control##*recovery ending location:}"
253-
pg_ctl start --silent --timeout=31536000 --wait --options='--config-file=/tmp/postgres.restore.conf' || failed=$?
254-
[[ "${started-}" == "${stopped}" && -n "${failed-}" ]] && exit "${failed}"
255-
started="${stopped}" && [[ -n "${failed-}" ]] && failed= && continue
256-
fi
257-
258-
recovery=$(psql -Atc "SELECT CASE
259-
WHEN NOT pg_catalog.pg_is_in_recovery() THEN false
260-
WHEN NOT pg_catalog.pg_is_wal_replay_paused() THEN true
261-
ELSE pg_catalog.pg_wal_replay_resume()::text = ''
262-
END recovery" && sleep 1) ||:
263-
done
264-
265-
pg_ctl stop --silent --wait --timeout=31536000
266-
mv "${pgdata}" "${pgdata}_bootstrap"`
267-
268-
return append([]string{"bash", "-ceu", "--", restoreScript, "-", pgdata}, args...)
194+
configure := strings.Join([]string{
195+
// With "hot_standby" on, some parameters cannot be smaller than they were
196+
// when Postgres was backed up. Configure these to match values reported by
197+
// "pg_controldata" before starting Postgres. These parameters are also
198+
// written to WAL files and may change during recovery. When they increase,
199+
// Postgres exits and we reconfigure it here.
200+
// - https://www.postgresql.org/docs/current/app-pgcontroldata.html
201+
`control=$(LC_ALL=C pg_controldata)`,
202+
`read -r max_conn <<< "${control##*max_connections setting:}"`,
203+
`read -r max_lock <<< "${control##*max_locks_per_xact setting:}"`,
204+
`read -r max_ptxn <<< "${control##*max_prepared_xacts setting:}"`,
205+
`read -r max_work <<< "${control##*max_worker_processes setting:}"`,
206+
207+
// During recovery, only allow connections over the domain socket.
208+
`echo > /tmp/pg_hba.restore.conf 'local all "postgres" peer'`,
209+
210+
// Combine parameters from Go with those detected in Bash.
211+
`cat > /tmp/postgres.restore.conf <<'EOF'`, ps.String(), `EOF`,
212+
`cat >> /tmp/postgres.restore.conf <<EOF`,
213+
`hba_file = '/tmp/pg_hba.restore.conf'`,
214+
`max_connections = '${max_conn}'`,
215+
`max_locks_per_transaction = '${max_lock}'`,
216+
`max_prepared_transactions = '${max_ptxn}'`,
217+
`max_worker_processes = '${max_work}'`,
218+
`EOF`,
219+
220+
`version=$(< "${PGDATA}/PG_VERSION")`,
221+
222+
// PostgreSQL v12 introduced the "max_wal_senders" parameter.
223+
`if [[ "${version}" -ge 12 ]]; then`,
224+
`read -r max_wals <<< "${control##*max_wal_senders setting:}"`,
225+
`echo >> /tmp/postgres.restore.conf "max_wal_senders = '${max_wals}'"`,
226+
`fi`,
227+
228+
// TODO(sockets): PostgreSQL v14 is able to connect over abstract sockets in the network namespace.
229+
`PGHOST=$([[ "${version}" -ge 14 ]] && echo '/tmp' || echo '/tmp')`,
230+
`echo >> /tmp/postgres.restore.conf "unix_socket_directories = '${PGHOST}'"`,
231+
}, "\n")
232+
233+
script := strings.Join([]string{
234+
`declare -r PGDATA="$1" opts="$2"; export PGDATA PGHOST`,
235+
236+
// Remove any "postmaster.pid" file leftover from a prior failure.
237+
`rm -f "${PGDATA}/postmaster.pid"`,
238+
239+
// Run the restore and print its arguments.
240+
`bash -xc "pgbackrest restore ${opts}"`,
241+
242+
// Ignore any Patroni settings present in the backup.
243+
`rm -f "${PGDATA}/patroni.dynamic.json"`,
244+
245+
// By default, pg_ctl waits 60 seconds for Postgres to stop or start.
246+
// We want to be certain when Postgres is running or not, so we use
247+
// a very large timeout (365 days) to effectively wait forever. With
248+
// this, the result of "pg_ctl --wait" indicates the state of Postgres.
249+
// - https://www.postgresql.org/docs/current/app-pg-ctl.html
250+
fmt.Sprintf(`export PGCTLTIMEOUT=%d`, 365*24*time.Hour/time.Second),
251+
252+
// Configure and start Postgres until we can see that it has finished
253+
// replaying WAL.
254+
//
255+
// PostgreSQL v13 and earlier exit when they need reconfiguration with
256+
// "hot_standby" on. This can cause pg_ctl to fail, so we compare the
257+
// LSN from before and after calling it. If the LSN changed, Postgres
258+
// ran and was able to replay WAL before exiting. In that case, configure
259+
// Postgres and start it again to see if it can make more progress.
260+
//
261+
// If Postgres exits after pg_ctl succeeds, psql returns nothing which
262+
// resets the "recovering" variable. Configure Postgres and start it again.
263+
`until [[ "${recovering=}" == 'f' ]]; do`,
264+
` if [[ -z "${recovering}" ]]; then`, configure,
265+
` read -r stopped <<< "${control##*recovery ending location:}"`,
266+
` pg_ctl start --silent --wait --options='-c config_file=/tmp/postgres.restore.conf' || failed=$?`,
267+
` [[ "${started-}" == "${stopped}" && -n "${failed-}" ]] && exit "${failed}"`,
268+
` started="${stopped}" && [[ -n "${failed-}" ]] && failed= && continue`,
269+
` fi`,
270+
// Ask Postgres if it is still recovering. PostgreSQL v14 pauses when it
271+
// needs reconfiguration with "hot_standby" on, and resuming replay causes
272+
// it to exit like prior versions.
273+
// - https://www.postgresql.org/docs/current/hot-standby.html
274+
//
275+
// NOTE: "pg_wal_replay_resume()" returns void which cannot be compared to
276+
// null. Instead, cast it to text and compare that for a boolean result.
277+
` recovering=$(psql -Atc "SELECT CASE`,
278+
` WHEN NOT pg_catalog.pg_is_in_recovery() THEN false`,
279+
` WHEN NOT pg_catalog.pg_is_wal_replay_paused() THEN true`,
280+
` ELSE pg_catalog.pg_wal_replay_resume()::text = ''`,
281+
` END" && sleep 1) ||:`,
282+
`done`,
283+
284+
// Replay is done. Stop Postgres gracefully and move the data directory
285+
// into position for our Patroni bootstrap method.
286+
`pg_ctl stop --silent --wait`,
287+
`mv "${PGDATA}" "${PGDATA}_bootstrap"`,
288+
}, "\n")
289+
290+
return append([]string{"bash", "-ceu", "--", script, "-", pgdata}, args...)
269291
}
270292

271293
// DedicatedSnapshotVolumeRestoreCommand returns the command for performing a pgBackRest delta restore

internal/postgres/parameters.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
package postgres
66

77
import (
8+
"fmt"
9+
"slices"
810
"strings"
911
)
1012

@@ -124,3 +126,21 @@ func (ps *ParameterSet) Value(name string) string {
124126
value, _ := ps.Get(name)
125127
return value
126128
}
129+
130+
// String renders the parameters as lines of "name = 'value'", one line per
// parameter, ordered by parameter name. Single quotes inside a value are
// doubled by escapeParameterQuotes before the value is wrapped in quotes.
func (ps *ParameterSet) String() string {
131+
keys := make([]string, 0, len(ps.values))
132+
for k := range ps.values {
133+
keys = append(keys, k)
134+
}
135+
136+
// Sort the names so the output order does not depend on map iteration order.
slices.Sort(keys)
137+
138+
var b strings.Builder
139+
for _, k := range keys {
140+
// Writing to a strings.Builder does not fail, so the result is deliberately ignored.
_, _ = fmt.Fprintf(&b, "%s = '%s'\n", k, escapeParameterQuotes(ps.values[k]))
141+
}
142+
return b.String()
143+
}
144+
145+
// escapeParameterQuotes is used by [ParameterSet.String].
146+
var escapeParameterQuotes = strings.NewReplacer(`'`, `''`).Replace

internal/postgres/parameters_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ func TestParameterSet(t *testing.T) {
5656

5757
ps2.Add("x", "n")
5858
assert.Assert(t, ps2.Value("x") != ps.Value("x"))
59+
60+
assert.DeepEqual(t, ps.String(), ``+
61+
`abc = 'j''l'`+"\n"+
62+
`x = 'z'`+"\n")
5963
}
6064

6165
func TestParameterSetAppendToList(t *testing.T) {

0 commit comments

Comments
 (0)