| Revision | a257715e4f47510944c06c9f11805bcc88824e3f (tree) |
|---|---|
| Time | 2018-06-08 14:11:06 |
| Author | Kyotaro Horiguchi <horiguchi.kyotaro@lab....> |
| Committer | Kyotaro Horiguchi |
Took in changes of pg_stat_statements.
Took in the following commits on pg_stat_statements. These commits
change unused parts of the file, so they do not affect the behavior
of pg_hint_plan.
4f37d09169 Avoid unlikely data-loss scenarios due to rename() without fsync.
93840f96c7 Improve contrib/pg_stat_statements' handling of garbage collection failure.
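
For context, commit 4f37d09169 replaces the bare rename() in pgss_shmem_shutdown() with durable_rename(), which also issues the fsyncs needed for the replacement to survive a crash. The standalone sketch below (not PostgreSQL's implementation; the helper name durable_rename_sketch and its error handling are illustrative only) shows the general pattern: flush the new file, rename it into place, then flush the containing directory.

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Illustrative only: fsync the new file, rename it into place, then fsync
 * the containing directory so the rename itself is durable.  Returns 0 on
 * success, -1 on failure (errno is left set by the failing call).
 */
static int
durable_rename_sketch(const char *oldfile, const char *newfile, const char *dir)
{
    int     fd;

    /* Make sure the new file's contents are on stable storage. */
    fd = open(oldfile, O_RDWR);
    if (fd < 0 || fsync(fd) != 0)
    {
        if (fd >= 0)
            close(fd);
        return -1;
    }
    close(fd);

    /* Atomically replace any old file. */
    if (rename(oldfile, newfile) != 0)
        return -1;

    /* Make the directory entry change durable as well. */
    fd = open(dir, O_RDONLY);
    if (fd < 0 || fsync(fd) != 0)
    {
        if (fd >= 0)
            close(fd);
        return -1;
    }
    close(fd);

    return 0;
}

int
main(void)
{
    /* Example: atomically and durably replace "stats.dat" in the cwd. */
    if (durable_rename_sketch("stats.dat.tmp", "stats.dat", ".") != 0)
        perror("durable_rename_sketch");
    return 0;
}
```

The full imported diff follows.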
```diff
@@ -171,7 +171,7 @@ typedef struct pgssEntry
     pgssHashKey key;            /* hash key of entry - MUST BE FIRST */
     Counters    counters;       /* the statistics for this query */
     Size        query_offset;   /* query text offset in external file */
-    int         query_len;      /* # of valid bytes in query string */
+    int         query_len;      /* # of valid bytes in query string, or -1 */
     int         encoding;       /* query text encoding */
     slock_t     mutex;          /* protects the counters only */
 } pgssEntry;
@@ -745,11 +745,7 @@ pgss_shmem_shutdown(int code, Datum arg)
     /*
      * Rename file into place, so we atomically replace any old one.
      */
-    if (rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE) != 0)
-        ereport(LOG,
-                (errcode_for_file_access(),
-                 errmsg("could not rename pg_stat_statement file \"%s\": %m",
-                        PGSS_DUMP_FILE ".tmp")));
+    (void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);
 
     /* Unlink query-texts file; it's not needed while shutdown */
     unlink(PGSS_TEXT_FILE);
@@ -1649,7 +1645,8 @@ entry_cmp(const void *lhs, const void *rhs)
 }
 
 /*
- * Deallocate least used entries.
+ * Deallocate least-used entries.
+ *
  * Caller must hold an exclusive lock on pgss->lock.
  */
 static void
@@ -1660,17 +1657,27 @@ entry_dealloc(void)
     pgssEntry  *entry;
     int         nvictims;
     int         i;
-    Size        totlen = 0;
+    Size        tottextlen;
+    int         nvalidtexts;
 
     /*
      * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
      * While we're scanning the table, apply the decay factor to the usage
-     * values.
+     * values, and update the mean query length.
+     *
+     * Note that the mean query length is almost immediately obsolete, since
+     * we compute it before not after discarding the least-used entries.
+     * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
+     * making two passes to get a more current result.  Likewise, the new
+     * cur_median_usage includes the entries we're about to zap.
      */
 
     entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
 
     i = 0;
+    tottextlen = 0;
+    nvalidtexts = 0;
+
     hash_seq_init(&hash_seq, pgss_hash);
     while ((entry = hash_seq_search(&hash_seq)) != NULL)
     {
@@ -1680,20 +1687,27 @@ entry_dealloc(void)
             entry->counters.usage *= STICKY_DECREASE_FACTOR;
         else
             entry->counters.usage *= USAGE_DECREASE_FACTOR;
-        /* Accumulate total size, too. */
-        totlen += entry->query_len + 1;
+        /* In the mean length computation, ignore dropped texts. */
+        if (entry->query_len >= 0)
+        {
+            tottextlen += entry->query_len + 1;
+            nvalidtexts++;
+        }
     }
 
+    /* Sort into increasing order by usage */
     qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
 
+    /* Record the (approximate) median usage */
     if (i > 0)
-    {
-        /* Record the (approximate) median usage */
         pgss->cur_median_usage = entries[i / 2]->counters.usage;
-        /* Record the mean query length */
-        pgss->mean_query_len = totlen / i;
-    }
+    /* Record the mean query length */
+    if (nvalidtexts > 0)
+        pgss->mean_query_len = tottextlen / nvalidtexts;
+    else
+        pgss->mean_query_len = ASSUMED_LENGTH_INIT;
 
+    /* Now zap an appropriate fraction of lowest-usage entries */
     nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
     nvictims = Min(nvictims, i);
 
@@ -1836,7 +1850,7 @@ qtext_load_file(Size *buffer_size)
     }
 
     /* Allocate buffer; beware that off_t might be wider than size_t */
-    if (stat.st_size <= MaxAllocSize)
+    if (stat.st_size <= MaxAllocHugeSize)
         buf = (char *) malloc(stat.st_size);
     else
         buf = NULL;
@@ -1844,7 +1858,9 @@ qtext_load_file(Size *buffer_size)
     {
         ereport(LOG,
                 (errcode(ERRCODE_OUT_OF_MEMORY),
-                 errmsg("out of memory")));
+                 errmsg("out of memory"),
+                 errdetail("Could not allocate enough memory to read pg_stat_statement file \"%s\".",
+                           PGSS_TEXT_FILE)));
         CloseTransientFile(fd);
         return NULL;
     }
@@ -1946,13 +1962,17 @@ need_gc_qtexts(void)
  * occur in the foreseeable future.
  *
  * The caller must hold an exclusive lock on pgss->lock.
+ *
+ * At the first sign of trouble we unlink the query text file to get a clean
+ * slate (although existing statistics are retained), rather than risk
+ * thrashing by allowing the same problem case to recur indefinitely.
  */
 static void
 gc_qtexts(void)
 {
     char       *qbuffer;
     Size        qbuffer_size;
-    FILE       *qfile;
+    FILE       *qfile = NULL;
     HASH_SEQ_STATUS hash_seq;
     pgssEntry  *entry;
     Size        extent;
@@ -1967,12 +1987,15 @@ gc_qtexts(void)
         return;
 
     /*
-     * Load the old texts file.  If we fail (out of memory, for instance) just
-     * skip the garbage collection.
+     * Load the old texts file.  If we fail (out of memory, for instance),
+     * invalidate query texts.  Hopefully this is rare.  It might seem better
+     * to leave things alone on an OOM failure, but the problem is that the
+     * file is only going to get bigger; hoping for a future non-OOM result is
+     * risky and can easily lead to complete denial of service.
      */
     qbuffer = qtext_load_file(&qbuffer_size);
     if (qbuffer == NULL)
-        return;
+        goto gc_fail;
 
     /*
      * We overwrite the query texts file in place, so as to reduce the risk of
@@ -2007,6 +2030,7 @@ gc_qtexts(void)
             /* Trouble ... drop the text */
             entry->query_offset = 0;
             entry->query_len = -1;
+            /* entry will not be counted in mean query length computation */
             continue;
         }
 
@@ -2091,7 +2115,36 @@ gc_fail:
         entry->query_len = -1;
     }
 
-    /* Seems like a good idea to bump the GC count even though we failed */
+    /*
+     * Destroy the query text file and create a new, empty one
+     */
+    (void) unlink(PGSS_TEXT_FILE);
+    qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
+    if (qfile == NULL)
+        ereport(LOG,
+                (errcode_for_file_access(),
+                 errmsg("could not write new pg_stat_statement file \"%s\": %m",
+                        PGSS_TEXT_FILE)));
+    else
+        FreeFile(qfile);
+
+    /* Reset the shared extent pointer */
+    pgss->extent = 0;
+
+    /* Reset mean_query_len to match the new state */
+    pgss->mean_query_len = ASSUMED_LENGTH_INIT;
+
+    /*
+     * Bump the GC count even though we failed.
+     *
+     * This is needed to make concurrent readers of file without any lock on
+     * pgss->lock notice existence of new version of file.  Once readers
+     * subsequently observe a change in GC count with pgss->lock held, that
+     * forces a safe reopen of file.  Writers also require that we bump here,
+     * of course.  (As required by locking protocol, readers and writers don't
+     * trust earlier file contents until gc_count is found unchanged after
+     * pgss->lock acquired in shared or exclusive mode respectively.)
+     */
     record_gc_qtexts();
 }
 
```
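
The entry_dealloc() hunks above make the mean query length count only entries whose text is still present (query_len >= 0), falling back to the module's initial assumption when no valid texts remain. Below is a minimal standalone sketch of that bookkeeping; DemoEntry, compute_mean_query_len, and the constant value are hypothetical stand-ins, not pg_stat_statements' actual code.

```c
#include <stdio.h>
#include <stddef.h>

/* Stand-in for pg_stat_statements' initial assumed mean query length. */
#define ASSUMED_LENGTH_INIT 1024

typedef struct DemoEntry
{
    int     query_len;          /* # of valid bytes in query string, or -1 */
} DemoEntry;

/*
 * Average the stored text lengths, skipping entries whose text was dropped
 * (query_len == -1), mirroring the patched entry_dealloc() loop.
 */
static size_t
compute_mean_query_len(const DemoEntry *entries, int n)
{
    size_t  tottextlen = 0;
    int     nvalidtexts = 0;
    int     i;

    for (i = 0; i < n; i++)
    {
        if (entries[i].query_len >= 0)
        {
            tottextlen += entries[i].query_len + 1; /* include trailing NUL */
            nvalidtexts++;
        }
    }

    /* Fall back to the initial assumption when nothing valid remains. */
    return (nvalidtexts > 0) ? tottextlen / nvalidtexts : ASSUMED_LENGTH_INIT;
}

int
main(void)
{
    DemoEntry   entries[] = {{11}, {-1}, {23}, {5}};

    printf("mean query length: %zu\n",
           compute_mean_query_len(entries, 4));
    return 0;
}
```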