@@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
166166 *
167167 * TODO: lock-timeout as parameter
168168 * TODO: we must think about more fine grain unlock mechanism - separate unlock_backup() function.
169+ * TODO: more accurate naming
170+ * -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCH`es are clear
171+ * -> shared lock -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
169172 */
170173bool
171174lock_backup (pgBackup * backup , bool strict , bool exclusive )
@@ -205,7 +208,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
205208 {
206209 /* release exclusive lock */
207210 if (fio_unlink (lock_file , FIO_BACKUP_HOST ) < 0 )
208- elog (ERROR , "Could not remove old lock file \"%s\": %s" ,
211+ elog (ERROR , "Could not remove exclusive lock file \"%s\": %s" ,
209212 lock_file , strerror (errno ));
210213
211214 /* we are done */
@@ -261,48 +264,16 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
261264 int fd = 0 ;
262265 char buffer [MAXPGPATH * 2 + 256 ];
263266 int ntries = LOCK_TIMEOUT ;
264- int log_freq = ntries / 5 ;
267+ int empty_tries = LOCK_STALE_TIMEOUT ;
265268 int len ;
266269 int encoded_pid ;
267- pid_t my_p_pid ;
268270
269271 join_path_components (lock_file , backup -> root_dir , BACKUP_LOCK_FILE );
270272
271- /*
272- * TODO: is this stuff with ppid below is relevant for us ?
273- *
274- * If the PID in the lockfile is our own PID or our parent's or
275- * grandparent's PID, then the file must be stale (probably left over from
276- * a previous system boot cycle). We need to check this because of the
277- * likelihood that a reboot will assign exactly the same PID as we had in
278- * the previous reboot, or one that's only one or two counts larger and
279- * hence the lockfile's PID now refers to an ancestor shell process. We
280- * allow pg_ctl to pass down its parent shell PID (our grandparent PID)
281- * via the environment variable PG_GRANDPARENT_PID; this is so that
282- * launching the postmaster via pg_ctl can be just as reliable as
283- * launching it directly. There is no provision for detecting
284- * further-removed ancestor processes, but if the init script is written
285- * carefully then all but the immediate parent shell will be root-owned
286- * processes and so the kill test will fail with EPERM. Note that we
287- * cannot get a false negative this way, because an existing postmaster
288- * would surely never launch a competing postmaster or pg_ctl process
289- * directly.
290- */
291- #ifndef WIN32
292- my_p_pid = getppid ();
293- #else
294-
295- /*
296- * Windows hasn't got getppid(), but doesn't need it since it's not using
297- * real kill() either...
298- */
299- my_p_pid = 0 ;
300- #endif
301-
302273 /*
303274 * We need a loop here because of race conditions. But don't loop forever
304275 * (for example, a non-writable $backup_instance_path directory might cause a failure
305- * that won't go away). 100 tries seems like plenty.
276+ * that won't go away).
306277 */
307278 do
308279 {
@@ -351,13 +322,38 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
351322 fclose (fp_out );
352323
353324 /*
354- * It should be possible only as a result of system crash,
355- * so its hypothetical owner should be dead by now
325+ * There are several possible reasons for lock file
326+ * to be empty:
327+ * - system crash
328+ * - process crash
329+ * - race between writer and reader
330+ *
331+ * Consider empty file to be stale after LOCK_STALE_TIMEOUT attempts.
332+ *
333+ * TODO: alternatively we can write into temp file (lock_file_%pid),
334+ * rename it and then re-read lock file to make sure,
335+ * that we are successfully acquired the lock.
356336 */
357337 if (len == 0 )
358338 {
359- elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
360- goto grab_lock ;
339+ if (empty_tries == 0 )
340+ {
341+ elog (WARNING , "Lock file \"%s\" is empty" , lock_file );
342+ goto grab_lock ;
343+ }
344+
345+ if ((empty_tries % LOG_FREQ ) == 0 )
346+ elog (WARNING , "Waiting %u seconds on empty exclusive lock for backup %s" ,
347+ empty_tries , base36enc (backup -> start_time ));
348+
349+ sleep (1 );
350+ /*
351+ * waiting on empty lock file should not affect
352+ * the timer for concurrent lockers (ntries).
353+ */
354+ empty_tries -- ;
355+ ntries ++ ;
356+ continue ;
361357 }
362358
363359 encoded_pid = atoi (buffer );
@@ -371,24 +367,23 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
371367
372368 /*
373369 * Check to see if the other process still exists
374- *
375- * Per discussion above, my_pid, my_p_pid can be
376- * ignored as false matches.
377- *
378370 * Normally kill() will fail with ESRCH if the given PID doesn't
379371 * exist.
380372 */
381- if (encoded_pid != my_pid && encoded_pid != my_p_pid )
373+ if (encoded_pid == my_pid )
374+ return 0 ;
375+ else
382376 {
383377 if (kill (encoded_pid , 0 ) == 0 )
384378 {
385379 /* complain every fifth interval */
386- if ((ntries % log_freq ) == 0 )
380+ if ((ntries % LOG_FREQ ) == 0 )
387381 {
388382 elog (WARNING , "Process %d is using backup %s, and is still running" ,
389383 encoded_pid , base36enc (backup -> start_time ));
390384
391- elog (WARNING , "Waiting %u seconds on lock for backup %s" , ntries , base36enc (backup -> start_time ));
385+ elog (WARNING , "Waiting %u seconds on exclusive lock for backup %s" ,
386+ ntries , base36enc (backup -> start_time ));
392387 }
393388
394389 sleep (1 );
@@ -435,7 +430,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
435430 errno = 0 ;
436431 if (fio_write (fd , buffer , strlen (buffer )) != strlen (buffer ))
437432 {
438- int save_errno = errno ;
433+ int save_errno = errno ;
439434
440435 fio_close (fd );
441436 fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -453,7 +448,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
453448
454449 if (fio_flush (fd ) != 0 )
455450 {
456- int save_errno = errno ;
451+ int save_errno = errno ;
457452
458453 fio_close (fd );
459454 fio_unlink (lock_file , FIO_BACKUP_HOST );
@@ -471,7 +466,7 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
471466
472467 if (fio_close (fd ) != 0 )
473468 {
474- int save_errno = errno ;
469+ int save_errno = errno ;
475470
476471 fio_unlink (lock_file , FIO_BACKUP_HOST );
477472
@@ -482,6 +477,10 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
482477 lock_file , strerror (save_errno ));
483478 }
484479
480+ // elog(LOG, "Acquired exclusive lock for backup %s after %ds",
481+ // base36enc(backup->start_time),
482+ // LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);
483+
485484 return 0 ;
486485}
487486
@@ -493,7 +492,6 @@ wait_read_only_owners(pgBackup *backup)
493492 char buffer [256 ];
494493 pid_t encoded_pid ;
495494 int ntries = LOCK_TIMEOUT ;
496- int log_freq = ntries / 5 ;
497495 char lock_file [MAXPGPATH ];
498496
499497 join_path_components (lock_file , backup -> root_dir , BACKUP_RO_LOCK_FILE );
@@ -523,7 +521,7 @@ wait_read_only_owners(pgBackup *backup)
523521 {
524522 if (kill (encoded_pid , 0 ) == 0 )
525523 {
526- if ((ntries % log_freq ) == 0 )
524+ if ((ntries % LOG_FREQ ) == 0 )
527525 {
528526 elog (WARNING , "Process %d is using backup %s in read only mode, and is still running" ,
529527 encoded_pid , base36enc (backup -> start_time ));
0 commit comments