Skip to content

Commit c2ba3fd

Browse files
committed
In basebackup.c, refactor to create read_file_data_into_buffer.
This further reduces the length and complexity of sendFile(), hopefully make it easier to understand and modify. In addition to moving some logic into a new function, I took this opportunity to make a few slight adjustments to sendFile() itself, including renaming the 'len' variable to 'bytes_done', since we use it to represent the number of bytes we've already handled so far, not the total length of the file. Patch by me, reviewed by David Steele. Discussion: http://postgr.es/m/CA+TgmoYt5jXH4U6cu1dm9Oe2FTn1aae6hBNhZzJJjyjbE_zYig@mail.gmail.com
1 parent 0531831 commit c2ba3fd

File tree

1 file changed

+133
-98
lines changed

1 file changed

+133
-98
lines changed

src/backend/backup/basebackup.c

Lines changed: 133 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeo
8383
static bool sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
8484
struct stat *statbuf, bool missing_ok, Oid dboid,
8585
backup_manifest_info *manifest, const char *spcoid);
86+
static off_t read_file_data_into_buffer(bbsink *sink,
87+
const char *readfilename, int fd,
88+
off_t offset, size_t length,
89+
BlockNumber blkno,
90+
bool verify_checksum,
91+
int *checksum_failures);
8692
static bool verify_page_checksum(Page page, XLogRecPtr start_lsn,
8793
BlockNumber blkno,
8894
uint16 *expected_checksum);
@@ -1490,9 +1496,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
14901496
BlockNumber blkno = 0;
14911497
int checksum_failures = 0;
14921498
off_t cnt;
1493-
int i;
1494-
pgoff_t len = 0;
1495-
char *page;
1499+
pgoff_t bytes_done = 0;
14961500
int segmentno = 0;
14971501
char *segmentpath;
14981502
bool verify_checksum = false;
@@ -1514,6 +1518,12 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15141518

15151519
_tarWriteHeader(sink, tarfilename, NULL, statbuf, false);
15161520

1521+
/*
1522+
* Checksums are verified in multiples of BLCKSZ, so the buffer length
1523+
* should be a multiple of the block size as well.
1524+
*/
1525+
Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1526+
15171527
if (!noverify_checksums && DataChecksumsEnabled())
15181528
{
15191529
char *filename;
@@ -1551,23 +1561,21 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15511561
* for a base backup we can ignore such extended data. It will be restored
15521562
* from WAL.
15531563
*/
1554-
while (len < statbuf->st_size)
1564+
while (bytes_done < statbuf->st_size)
15551565
{
1556-
size_t remaining = statbuf->st_size - len;
1566+
size_t remaining = statbuf->st_size - bytes_done;
15571567

15581568
/* Try to read some more data. */
1559-
cnt = basebackup_read_file(fd, sink->bbs_buffer,
1560-
Min(sink->bbs_buffer_length, remaining),
1561-
len, readfilename, true);
1569+
cnt = read_file_data_into_buffer(sink, readfilename, fd, bytes_done,
1570+
remaining,
1571+
blkno + segmentno * RELSEG_SIZE,
1572+
verify_checksum,
1573+
&checksum_failures);
15621574

15631575
/*
1564-
* The checksums are verified at block level, so we iterate over the
1565-
* buffer in chunks of BLCKSZ, after making sure that
1566-
* TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
1567-
* BLCKSZ bytes.
1576+
* If the amount of data we were able to read was not a multiple of
1577+
* BLCKSZ, we cannot verify checksums, which are block-level.
15681578
*/
1569-
Assert((sink->bbs_buffer_length % BLCKSZ) == 0);
1570-
15711579
if (verify_checksum && (cnt % BLCKSZ != 0))
15721580
{
15731581
ereport(WARNING,
@@ -1578,84 +1586,6 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
15781586
verify_checksum = false;
15791587
}
15801588

1581-
if (verify_checksum)
1582-
{
1583-
for (i = 0; i < cnt / BLCKSZ; i++)
1584-
{
1585-
int reread_cnt;
1586-
uint16 expected_checksum;
1587-
1588-
page = sink->bbs_buffer + BLCKSZ * i;
1589-
1590-
/* If the page is OK, go on to the next one. */
1591-
if (verify_page_checksum(page, sink->bbs_state->startptr,
1592-
blkno + i + segmentno * RELSEG_SIZE,
1593-
&expected_checksum))
1594-
continue;
1595-
1596-
/*
1597-
* Retry the block on the first failure. It's possible that
1598-
* we read the first 4K page of the block just before postgres
1599-
* updated the entire block so it ends up looking torn to us.
1600-
* If, before we retry the read, the concurrent write of the
1601-
* block finishes, the page LSN will be updated and we'll
1602-
* realize that we should ignore this block.
1603-
*
1604-
* There's no guarantee that this will actually happen,
1605-
* though: the torn write could take an arbitrarily long time
1606-
* to complete. Retrying multiple times wouldn't fix this
1607-
* problem, either, though it would reduce the chances of it
1608-
* happening in practice. The only real fix here seems to be
1609-
* to have some kind of interlock that allows us to wait until
1610-
* we can be certain that no write to the block is in
1611-
* progress. Since we don't have any such thing right now, we
1612-
* just do this and hope for the best.
1613-
*/
1614-
reread_cnt =
1615-
basebackup_read_file(fd,
1616-
sink->bbs_buffer + BLCKSZ * i,
1617-
BLCKSZ, len + BLCKSZ * i,
1618-
readfilename,
1619-
false);
1620-
if (reread_cnt == 0)
1621-
{
1622-
/*
1623-
* If we hit end-of-file, a concurrent truncation must
1624-
* have occurred, so break out of this loop just as if the
1625-
* initial fread() returned 0. We'll drop through to the
1626-
* same code that handles that case. (We must fix up cnt
1627-
* first, though.)
1628-
*/
1629-
cnt = BLCKSZ * i;
1630-
break;
1631-
}
1632-
1633-
/* If the page now looks OK, go on to the next one. */
1634-
if (verify_page_checksum(page, sink->bbs_state->startptr,
1635-
blkno + i + segmentno * RELSEG_SIZE,
1636-
&expected_checksum))
1637-
continue;
1638-
1639-
/* Handle checksum failure. */
1640-
checksum_failures++;
1641-
if (checksum_failures <= 5)
1642-
ereport(WARNING,
1643-
(errmsg("checksum verification failed in "
1644-
"file \"%s\", block %u: calculated "
1645-
"%X but expected %X",
1646-
readfilename, blkno + i, expected_checksum,
1647-
((PageHeader) page)->pd_checksum)));
1648-
if (checksum_failures == 5)
1649-
ereport(WARNING,
1650-
(errmsg("further checksum verification "
1651-
"failures in file \"%s\" will not "
1652-
"be reported", readfilename)));
1653-
}
1654-
1655-
/* Update block number for next pass through the outer loop. */
1656-
blkno += i;
1657-
}
1658-
16591589
/*
16601590
* If we hit end-of-file, a concurrent truncation must have occurred.
16611591
* That's not an error condition, because WAL replay will fix things
@@ -1664,21 +1594,23 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
16641594
if (cnt == 0)
16651595
break;
16661596

1597+
/* Update block number and # of bytes done for next loop iteration. */
1598+
blkno += cnt / BLCKSZ;
1599+
bytes_done += cnt;
1600+
16671601
/* Archive the data we just read. */
16681602
bbsink_archive_contents(sink, cnt);
16691603

16701604
/* Also feed it to the checksum machinery. */
16711605
if (pg_checksum_update(&checksum_ctx,
16721606
(uint8 *) sink->bbs_buffer, cnt) < 0)
16731607
elog(ERROR, "could not update checksum of base backup");
1674-
1675-
len += cnt;
16761608
}
16771609

16781610
/* If the file was truncated while we were sending it, pad it with zeros */
1679-
while (len < statbuf->st_size)
1611+
while (bytes_done < statbuf->st_size)
16801612
{
1681-
size_t remaining = statbuf->st_size - len;
1613+
size_t remaining = statbuf->st_size - bytes_done;
16821614
size_t nbytes = Min(sink->bbs_buffer_length, remaining);
16831615

16841616
MemSet(sink->bbs_buffer, 0, nbytes);
@@ -1687,15 +1619,15 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
16871619
nbytes) < 0)
16881620
elog(ERROR, "could not update checksum of base backup");
16891621
bbsink_archive_contents(sink, nbytes);
1690-
len += nbytes;
1622+
bytes_done += nbytes;
16911623
}
16921624

16931625
/*
16941626
* Pad to a block boundary, per tar format requirements. (This small piece
16951627
* of data is probably not worth throttling, and is not checksummed
16961628
* because it's not actually part of the file.)
16971629
*/
1698-
_tarWritePadding(sink, len);
1630+
_tarWritePadding(sink, bytes_done);
16991631

17001632
CloseTransientFile(fd);
17011633

@@ -1718,6 +1650,109 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
17181650
return true;
17191651
}
17201652

1653+
/*
1654+
* Read some more data from the file into the bbsink's buffer, verifying
1655+
* checksums as required.
1656+
*
1657+
* 'offset' is the file offset from which we should begin to read, and
1658+
* 'length' is the amount of data that should be read. The actual amount
1659+
* of data read will be less than the requested amount if the bbsink's
1660+
* buffer isn't big enough to hold it all, or if the underlying file has
1661+
* been truncated. The return value is the number of bytes actually read.
1662+
*
1663+
* 'blkno' is the block number of the first page in the bbsink's buffer
1664+
* relative to the start of the relation.
1665+
*
1666+
* 'verify_checksum' indicates whether we should try to verify checksums
1667+
* for the blocks we read. If we do this, we'll update *checksum_failures
1668+
* and issue warnings as appropriate.
1669+
*/
1670+
static off_t
1671+
read_file_data_into_buffer(bbsink *sink, const char *readfilename, int fd,
1672+
off_t offset, size_t length, BlockNumber blkno,
1673+
bool verify_checksum, int *checksum_failures)
1674+
{
1675+
off_t cnt;
1676+
int i;
1677+
char *page;
1678+
1679+
/* Try to read some more data. */
1680+
cnt = basebackup_read_file(fd, sink->bbs_buffer,
1681+
Min(sink->bbs_buffer_length, length),
1682+
offset, readfilename, true);
1683+
1684+
/* Can't verify checksums if read length is not a multiple of BLCKSZ. */
1685+
if (!verify_checksum || (cnt % BLCKSZ) != 0)
1686+
return cnt;
1687+
1688+
/* Verify checksum for each block. */
1689+
for (i = 0; i < cnt / BLCKSZ; i++)
1690+
{
1691+
int reread_cnt;
1692+
uint16 expected_checksum;
1693+
1694+
page = sink->bbs_buffer + BLCKSZ * i;
1695+
1696+
/* If the page is OK, go on to the next one. */
1697+
if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1698+
&expected_checksum))
1699+
continue;
1700+
1701+
/*
1702+
* Retry the block on the first failure. It's possible that we read
1703+
* the first 4K page of the block just before postgres updated the
1704+
* entire block so it ends up looking torn to us. If, before we retry
1705+
* the read, the concurrent write of the block finishes, the page LSN
1706+
* will be updated and we'll realize that we should ignore this block.
1707+
*
1708+
* There's no guarantee that this will actually happen, though: the
1709+
* torn write could take an arbitrarily long time to complete.
1710+
* Retrying multiple times wouldn't fix this problem, either, though
1711+
* it would reduce the chances of it happening in practice. The only
1712+
* real fix here seems to be to have some kind of interlock that
1713+
* allows us to wait until we can be certain that no write to the
1714+
* block is in progress. Since we don't have any such thing right now,
1715+
* we just do this and hope for the best.
1716+
*/
1717+
reread_cnt =
1718+
basebackup_read_file(fd, sink->bbs_buffer + BLCKSZ * i,
1719+
BLCKSZ, offset + BLCKSZ * i,
1720+
readfilename, false);
1721+
if (reread_cnt == 0)
1722+
{
1723+
/*
1724+
* If we hit end-of-file, a concurrent truncation must have
1725+
* occurred, so reduce cnt to reflect only the blocks already
1726+
* processed and break out of this loop.
1727+
*/
1728+
cnt = BLCKSZ * i;
1729+
break;
1730+
}
1731+
1732+
/* If the page now looks OK, go on to the next one. */
1733+
if (verify_page_checksum(page, sink->bbs_state->startptr, blkno + i,
1734+
&expected_checksum))
1735+
continue;
1736+
1737+
/* Handle checksum failure. */
1738+
(*checksum_failures)++;
1739+
if (*checksum_failures <= 5)
1740+
ereport(WARNING,
1741+
(errmsg("checksum verification failed in "
1742+
"file \"%s\", block %u: calculated "
1743+
"%X but expected %X",
1744+
readfilename, blkno + i, expected_checksum,
1745+
((PageHeader) page)->pd_checksum)));
1746+
if (*checksum_failures == 5)
1747+
ereport(WARNING,
1748+
(errmsg("further checksum verification "
1749+
"failures in file \"%s\" will not "
1750+
"be reported", readfilename)));
1751+
}
1752+
1753+
return cnt;
1754+
}
1755+
17211756
/*
17221757
* Try to verify the checksum for the provided page, if it seems appropriate
17231758
* to do so.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy