@@ -3751,6 +3751,177 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
3751
3751
return numdistinct ;
3752
3752
}
3753
3753
3754
+ /*
3755
+ * Try to estimate the bucket size of the hash join inner side when the join
3756
+ * condition contains two or more clauses by employing extended statistics.
3757
+ *
3758
+ * The main idea of this approach is that the distinct value generated by
3759
+ * multivariate estimation on two or more columns would provide less bucket size
3760
+ * than estimation on one separate column.
3761
+ *
3762
+ * IMPORTANT: It is crucial to synchronise the approach of combining different
3763
+ * estimations with the caller's method.
3764
+ * Return a list of clauses that didn't fetch any extended statistics.
3765
+ */
3766
+ List *
3767
+ estimate_multivariate_bucketsize (PlannerInfo * root , RelOptInfo * inner ,
3768
+ List * hashclauses ,
3769
+ Selectivity * innerbucketsize )
3770
+ {
3771
+ List * clauses = list_copy (hashclauses );
3772
+ List * otherclauses = NIL ;
3773
+ double ndistinct = 1.0 ;
3774
+
3775
+ if (list_length (hashclauses ) <= 1 )
3776
+ /*
3777
+ * Nothing to do for a single clause. Could we employ univariate
3778
+ * extended stat here?
3779
+ */
3780
+ return hashclauses ;
3781
+
3782
+ while (clauses != NIL )
3783
+ {
3784
+ ListCell * lc ;
3785
+ int relid = -1 ;
3786
+ List * varinfos = NIL ;
3787
+ List * saveList = NIL ;
3788
+ double mvndistinct ;
3789
+ List * origin_varinfos ;
3790
+ int group_relid = -1 ;
3791
+ RelOptInfo * group_rel ;
3792
+ ListCell * lc1 , * lc2 ;
3793
+
3794
+ /*
3795
+ * Find clauses, referencing the same single base relation and try to
3796
+ * estimate such a group with extended statistics.
3797
+ * Create varinfo for an approved clause, push it to otherclauses, if it
3798
+ * can't be estimated here or ignore to process at the next iteration.
3799
+ */
3800
+ foreach (lc , clauses )
3801
+ {
3802
+ RestrictInfo * rinfo = lfirst_node (RestrictInfo , lc );
3803
+ Node * expr ;
3804
+ Relids relids ;
3805
+ GroupVarInfo * varinfo ;
3806
+
3807
+ /*
3808
+ * Find the inner side of the join which we need to estimate the
3809
+ * number of buckets. Use outer_is_left because the
3810
+ * clause_sides_match_join routine has called on hash clauses.
3811
+ */
3812
+ relids = rinfo -> outer_is_left ?
3813
+ rinfo -> right_relids : rinfo -> left_relids ;
3814
+ expr = rinfo -> outer_is_left ?
3815
+ get_rightop (rinfo -> clause ) : get_leftop (rinfo -> clause );
3816
+
3817
+ if (bms_get_singleton_member (relids , & relid ) &&
3818
+ root -> simple_rel_array [relid ]-> statlist != NIL )
3819
+ {
3820
+ /*
3821
+ * This inner-side expression references only one relation.
3822
+ * Extended statistics on this clause can exist.
3823
+ */
3824
+
3825
+ if (group_relid < 0 )
3826
+ {
3827
+ RangeTblEntry * rte = root -> simple_rte_array [relid ];
3828
+
3829
+ if (!rte || (rte -> relkind != RELKIND_RELATION &&
3830
+ rte -> relkind != RELKIND_MATVIEW &&
3831
+ rte -> relkind != RELKIND_FOREIGN_TABLE &&
3832
+ rte -> relkind != RELKIND_PARTITIONED_TABLE ))
3833
+ {
3834
+ /* Extended statistics can't exist in principle */
3835
+ otherclauses = lappend (otherclauses , rinfo );
3836
+ clauses = foreach_delete_current (clauses , lc );
3837
+ continue ;
3838
+ }
3839
+
3840
+ group_relid = relid ;
3841
+ group_rel = root -> simple_rel_array [relid ];
3842
+ Assert (group_rel != NULL );
3843
+ }
3844
+ else if (group_relid != relid )
3845
+ /*
3846
+ * Being in the group forming state we don't
3847
+ * need other clauses.
3848
+ */
3849
+ continue ;
3850
+
3851
+ varinfo = (GroupVarInfo * ) palloc (sizeof (GroupVarInfo ));
3852
+ varinfo -> var = expr ;
3853
+ varinfo -> rel = root -> simple_rel_array [relid ];
3854
+ varinfo -> ndistinct = 0.0 ;
3855
+ varinfo -> isdefault = false;
3856
+ varinfos = lappend (varinfos , varinfo );
3857
+
3858
+ /*
3859
+ * Remember the link to RestrictInfo for the case the clause is
3860
+ * failed to be estimated.
3861
+ */
3862
+ saveList = lappend (saveList , rinfo );
3863
+ }
3864
+ else
3865
+ /* This clause can't be estimated with extended statistics */
3866
+ otherclauses = lappend (otherclauses , rinfo );
3867
+
3868
+ clauses = foreach_delete_current (clauses , lc );
3869
+ }
3870
+
3871
+ if (list_length (varinfos ) < 2 )
3872
+ {
3873
+ /*
3874
+ * Multivariate statistics doesn't apply to single column except
3875
+ * expressions, but it has not implemented yet.
3876
+ */
3877
+ otherclauses = list_concat (otherclauses , saveList );
3878
+ list_free_deep (varinfos );
3879
+ list_free (saveList );
3880
+ continue ;
3881
+ }
3882
+
3883
+ /* Let's employ extended statistics. */
3884
+ origin_varinfos = varinfos ;
3885
+ for (;;)
3886
+ {
3887
+ bool estimated = estimate_multivariate_ndistinct (root ,
3888
+ group_rel ,
3889
+ & varinfos ,
3890
+ & mvndistinct );
3891
+ if (!estimated )
3892
+ break ;
3893
+
3894
+ /*
3895
+ * We've got an estimation. Use ndistinct value in consistent way -
3896
+ * according to the caller's logic (see final_cost_hashjoin).
3897
+ */
3898
+ if (ndistinct < mvndistinct )
3899
+ ndistinct = mvndistinct ;
3900
+ Assert (ndistinct >= 1.0 );
3901
+ }
3902
+
3903
+ Assert (list_length (origin_varinfos ) == list_length (saveList ));
3904
+
3905
+ /*
3906
+ * Remove already estimated clauses from the saveList
3907
+ */
3908
+ forboth (lc1 , origin_varinfos , lc2 , saveList )
3909
+ {
3910
+ GroupVarInfo * vinfo = lfirst (lc1 );
3911
+
3912
+ if (!list_member_ptr (varinfos , vinfo ))
3913
+ /* Already estimated */
3914
+ continue ;
3915
+
3916
+ /* Can't be estimated here - push to the returning list */
3917
+ otherclauses = lappend (otherclauses , lfirst (lc2 ));
3918
+ }
3919
+ }
3920
+
3921
+ * innerbucketsize = 1.0 / ndistinct ;
3922
+ return otherclauses ;
3923
+ }
3924
+
3754
3925
/*
3755
3926
* Estimate hash bucket statistics when the specified expression is used
3756
3927
* as a hash key for the given number of buckets.
0 commit comments