Skip to content

Commit a49fdac

Browse files
committed
bugfixes and improved error handling in ConcurrentPartWorker
1 parent 6494216 commit a49fdac

File tree

2 files changed

+84
-54
lines changed

2 files changed

+84
-54
lines changed

src/include/pathman_workers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ typedef struct
7474
pid_t pid; /* worker's PID */
7575
Oid dbid; /* database which contains the relation */
7676
Oid relid; /* table to be partitioned concurrently */
77-
uint64 total_rows; /* total amount of rows processed */
77+
int64 total_rows; /* total amount of rows processed */
7878

7979
int32 batch_size; /* number of rows in a batch */
8080
float8 sleep_time; /* how long should we sleep in case of error? */

src/pathman_workers.c

Lines changed: 83 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -443,11 +443,12 @@ free_cps_slot(int code, Datum arg)
443443
void
444444
bgw_main_concurrent_part(Datum main_arg)
445445
{
446-
int rows;
446+
ConcurrentPartSlot *part_slot;
447+
char *sql = NULL;
448+
int64 rows;
447449
bool failed;
448450
int failures_count = 0;
449-
char *sql = NULL;
450-
ConcurrentPartSlot *part_slot;
451+
LOCKMODE lockmode = RowExclusiveLock;
451452

452453
/* Update concurrent part slot */
453454
part_slot = &concurrent_part_slots[DatumGetInt32(main_arg)];
@@ -479,12 +480,14 @@ bgw_main_concurrent_part(Datum main_arg)
479480
/* Do the job */
480481
do
481482
{
482-
MemoryContext old_mcxt;
483+
MemoryContext old_mcxt;
483484

484485
Oid types[2] = { OIDOID, INT4OID };
485486
Datum vals[2] = { part_slot->relid, part_slot->batch_size };
486487
bool nulls[2] = { false, false };
487488

489+
bool rel_locked = false;
490+
488491
/* Reset loop variables */
489492
failed = false;
490493
rows = 0;
@@ -520,66 +523,89 @@ bgw_main_concurrent_part(Datum main_arg)
520523
/* Exec ret = _partition_data_concurrent() */
521524
PG_TRY();
522525
{
523-
/* Make sure that relation exists and has partitions */
524-
if (SearchSysCacheExists1(RELOID, ObjectIdGetDatum(part_slot->relid)) &&
525-
get_pathman_relation_info(part_slot->relid) != NULL)
526-
{
527-
int ret;
528-
bool isnull;
526+
int ret;
527+
bool isnull;
529528

530-
ret = SPI_execute_with_args(sql, 2, types, vals, nulls, false, 0);
531-
if (ret == SPI_OK_SELECT)
532-
{
533-
TupleDesc tupdesc = SPI_tuptable->tupdesc;
534-
HeapTuple tuple = SPI_tuptable->vals[0];
529+
/* Lock relation for DELETE and INSERT */
530+
if (!ConditionalLockRelationOid(part_slot->relid, lockmode))
531+
{
532+
elog(ERROR, "could not take lock on relation %u", part_slot->relid);
533+
}
535534

536-
Assert(SPI_processed == 1); /* there should be 1 result at most */
535+
/* Great, now relation is locked */
536+
rel_locked = true;
537537

538-
rows = DatumGetInt32(SPI_getbinval(tuple, tupdesc, 1, &isnull));
538+
/* Make sure that relation exists */
539+
if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(part_slot->relid)))
540+
{
541+
/* Exit after we raise ERROR */
542+
failures_count = PART_WORKER_MAX_ATTEMPTS;
539543

540-
Assert(!isnull); /* ... and ofc it must not be NULL */
541-
}
544+
elog(ERROR, "relation %u does not exist", part_slot->relid);
542545
}
543-
/* Otherwise it's time to exit */
544-
else
546+
547+
/* Make sure that relation has partitions */
548+
if (get_pathman_relation_info(part_slot->relid) == NULL)
545549
{
550+
/* Exit after we raise ERROR */
546551
failures_count = PART_WORKER_MAX_ATTEMPTS;
547552

548-
elog(LOG, "relation \"%u\" is not partitioned (or does not exist)",
549-
part_slot->relid);
553+
elog(ERROR, "relation \"%s\" is not partitioned",
554+
get_rel_name(part_slot->relid));
555+
}
556+
557+
/* Call concurrent partitioning function */
558+
ret = SPI_execute_with_args(sql, 2, types, vals, nulls, false, 0);
559+
if (ret == SPI_OK_SELECT)
560+
{
561+
TupleDesc tupdesc = SPI_tuptable->tupdesc;
562+
HeapTuple tuple = SPI_tuptable->vals[0];
563+
564+
/* There should be 1 result at most */
565+
Assert(SPI_processed == 1);
566+
567+
/* Extract number of processed rows */
568+
rows = DatumGetInt64(SPI_getbinval(tuple, tupdesc, 1, &isnull));
569+
Assert(!isnull); /* ... and ofc it must not be NULL */
550570
}
571+
/* Else raise generic error */
572+
else elog(ERROR, "partitioning function returned %u", ret);
573+
574+
/* Finally, unlock our partitioned table */
575+
UnlockRelationOid(part_slot->relid, lockmode);
551576
}
552577
PG_CATCH();
553578
{
554579
/*
555580
* The most common exception we can catch here is a deadlock with
556581
* concurrent user queries. Check that attempts count doesn't exceed
557-
* some reasonable value
582+
* some reasonable value.
558583
*/
559-
ErrorData *error;
560-
char *sleep_time_str;
584+
ErrorData *error;
585+
586+
/* Unlock relation if we caught ERROR too early */
587+
if (rel_locked)
588+
UnlockRelationOid(part_slot->relid, lockmode);
589+
590+
/* Increase number of failures and set 'failed' status */
591+
failures_count++;
592+
failed = true;
561593

562594
/* Switch to the original context & copy edata */
563595
MemoryContextSwitchTo(old_mcxt);
564596
error = CopyErrorData();
565597
FlushErrorState();
566598

567599
/* Print messsage for this BGWorker to server log */
568-
sleep_time_str = datum_to_cstring(Float8GetDatum(part_slot->sleep_time),
569-
FLOAT8OID);
570-
failures_count++;
571600
ereport(LOG,
572601
(errmsg("%s: %s", concurrent_part_bgw, error->message),
573-
errdetail("attempt: %d/%d, sleep time: %s",
602+
errdetail("attempt: %d/%d, sleep time: %.2f",
574603
failures_count,
575604
PART_WORKER_MAX_ATTEMPTS,
576-
sleep_time_str)));
577-
pfree(sleep_time_str); /* free the time string */
605+
(float) part_slot->sleep_time)));
578606

607+
/* Finally, free error data */
579608
FreeErrorData(error);
580-
581-
/* Set 'failed' flag */
582-
failed = true;
583609
}
584610
PG_END_TRY();
585611

@@ -606,9 +632,10 @@ bgw_main_concurrent_part(Datum main_arg)
606632
/* Failed this time, wait */
607633
else if (failed)
608634
{
609-
/* Abort transaction and sleep for a second */
635+
/* Abort transaction */
610636
AbortCurrentTransaction();
611637

638+
/* Sleep for a specified amount of time (default 1s) */
612639
DirectFunctionCall1(pg_sleep, Float8GetDatum(part_slot->sleep_time));
613640
}
614641

@@ -626,8 +653,10 @@ bgw_main_concurrent_part(Datum main_arg)
626653

627654
#ifdef USE_ASSERT_CHECKING
628655
/* Report debug message */
629-
elog(DEBUG1, "%s: relocated %d rows, total: " UINT64_FORMAT " [%u]",
630-
concurrent_part_bgw, rows, part_slot->total_rows, MyProcPid);
656+
elog(DEBUG1, "%s: "
657+
"relocated" INT64_FORMAT "rows, "
658+
"total: " INT64_FORMAT,
659+
concurrent_part_bgw, rows, part_slot->total_rows);
631660
#endif
632661
}
633662

@@ -636,9 +665,6 @@ bgw_main_concurrent_part(Datum main_arg)
636665
break;
637666
}
638667
while(rows > 0 || failed); /* do while there's still rows to be relocated */
639-
640-
/* Reclaim the resources */
641-
pfree(sql);
642668
}
643669

644670

@@ -824,26 +850,33 @@ show_concurrent_part_tasks_internal(PG_FUNCTION_ARGS)
824850
/* Iterate through worker slots */
825851
for (i = userctx->cur_idx; i < PART_WORKER_SLOTS; i++)
826852
{
827-
ConcurrentPartSlot *cur_slot = &concurrent_part_slots[i];
853+
ConcurrentPartSlot *cur_slot = &concurrent_part_slots[i],
854+
slot_copy;
828855
HeapTuple htup = NULL;
829856

830-
HOLD_INTERRUPTS();
857+
/* Copy slot to process local memory */
831858
SpinLockAcquire(&cur_slot->mutex);
859+
memcpy(&slot_copy, cur_slot, sizeof(ConcurrentPartSlot));
860+
SpinLockRelease(&cur_slot->mutex);
832861

833-
if (cur_slot->worker_status != CPS_FREE)
862+
if (slot_copy.worker_status != CPS_FREE)
834863
{
835864
Datum values[Natts_pathman_cp_tasks];
836865
bool isnull[Natts_pathman_cp_tasks] = { 0 };
837866

838-
values[Anum_pathman_cp_tasks_userid - 1] = cur_slot->userid;
839-
values[Anum_pathman_cp_tasks_pid - 1] = cur_slot->pid;
840-
values[Anum_pathman_cp_tasks_dbid - 1] = cur_slot->dbid;
841-
values[Anum_pathman_cp_tasks_relid - 1] = cur_slot->relid;
842-
values[Anum_pathman_cp_tasks_processed - 1] = cur_slot->total_rows;
867+
values[Anum_pathman_cp_tasks_userid - 1] = slot_copy.userid;
868+
values[Anum_pathman_cp_tasks_pid - 1] = slot_copy.pid;
869+
values[Anum_pathman_cp_tasks_dbid - 1] = slot_copy.dbid;
870+
values[Anum_pathman_cp_tasks_relid - 1] = slot_copy.relid;
871+
872+
/* Record processed rows */
873+
values[Anum_pathman_cp_tasks_processed - 1] =
874+
/* FIXME: use Int64GetDatum() in release 1.5 */
875+
Int32GetDatum((int32) slot_copy.total_rows);
843876

844877
/* Now build a status string */
845878
values[Anum_pathman_cp_tasks_status - 1] =
846-
CStringGetTextDatum(cps_print_status(cur_slot->worker_status));
879+
CStringGetTextDatum(cps_print_status(slot_copy.worker_status));
847880

848881
/* Form output tuple */
849882
htup = heap_form_tuple(funcctx->tuple_desc, values, isnull);
@@ -852,9 +885,6 @@ show_concurrent_part_tasks_internal(PG_FUNCTION_ARGS)
852885
userctx->cur_idx = i + 1;
853886
}
854887

855-
SpinLockRelease(&cur_slot->mutex);
856-
RESUME_INTERRUPTS();
857-
858888
/* Return tuple if needed */
859889
if (htup)
860890
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(htup));

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy