Drizzled Public API Documentation

unique.cc
1 /* Copyright (C) 2001 MySQL AB
2 
3  This program is free software; you can redistribute it and/or modify
4  it under the terms of the GNU General Public License as published by
5  the Free Software Foundation; version 2 of the License.
6 
7  This program is distributed in the hope that it will be useful,
8  but WITHOUT ANY WARRANTY; without even the implied warranty of
9  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10  GNU General Public License for more details.
11 
12  You should have received a copy of the GNU General Public License
13  along with this program; if not, write to the Free Software
14  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
15 
16 /*
17  Function to handle quick removal of duplicates
18  This code is used when doing multi-table deletes to find the rows in
19  reference tables that needs to be deleted.
20 
21  The basic idea is as follows:
22 
23  Store first all strings in a binary tree, ignoring duplicates.
24 
25  The unique entries will be returned in sort order, to ensure that we do the
26  deletes in disk order.
27 */
28 
29 #include <config.h>
30 
31 #include <math.h>
32 
33 #include <queue>
34 
35 #include <drizzled/sql_sort.h>
36 #include <drizzled/session.h>
37 #include <drizzled/sql_list.h>
38 #include <drizzled/internal/iocache.h>
39 #include <drizzled/unique.h>
40 #include <drizzled/table.h>
41 
42 #if defined(CMATH_NAMESPACE)
43 using namespace CMATH_NAMESPACE;
44 #endif
45 
46 using namespace std;
47 
48 namespace drizzled {
49 
50 int unique_write_to_ptrs(unsigned char* key,
51  uint32_t, Unique *unique)
52 {
53  memcpy(unique->record_pointers, key, unique->size);
54  unique->record_pointers+=unique->size;
55  return 0;
56 }
57 
58 Unique::Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
59  uint32_t size_arg, size_t max_in_memory_size_arg)
60  : max_in_memory_size(max_in_memory_size_arg),
61  size(size_arg),
62  elements(0)
63 {
64  // Second element is max size for memory use in unique sort
65  tree.init_tree(0, 0, size, comp_func, false,
66  NULL, comp_func_fixed_arg);
67 }
68 
69 
70 /*
71  Calculate log2(n!)
72 
73  NOTES
74  Stirling's approximate formula is used:
75 
76  n! ~= sqrt(2*M_PI*n) * (n/M_E)^n
77 
78  Derivation of formula used for calculations is as follows:
79 
80  log2(n!) = log(n!)/log(2) = log(sqrt(2*M_PI*n)*(n/M_E)^n) / log(2) =
81 
82  = (log(2*M_PI*n)/2 + n*log(n/M_E)) / log(2).
83 */
84 
85 inline double log2_n_fact(double x)
86 {
87  return (log(2*M_PI*x)/2 + x*log(x/M_E)) / M_LN2;
88 }
89 
90 
91 /*
92  Calculate cost of using Unique for processing nkeys elements of size
93  key_size using max_in_memory_size memory.
94 
95  SYNOPSIS
96  Unique::get_use_cost()
97  buffer space for temporary data, use Unique::get_cost_calc_buff_size
98  to get # bytes needed.
99  nkeys #of elements in Unique
100  key_size size of each elements in bytes
101  max_in_memory_size amount of memory Unique will be allowed to use
102 
103  RETURN
104  Cost in disk seeks.
105 
106  NOTES
107  cost(using_unqiue) =
108  cost(create_trees) + (see #1)
109  cost(merge) + (see #2)
110  cost(read_result) (see #3)
111 
112  1. Cost of trees creation
113  For each Unique::put operation there will be 2*log2(n+1) elements
114  comparisons, where n runs from 1 tree_size (we assume that all added
115  elements are different). Together this gives:
116 
117  n_compares = 2*(log2(2) + log2(3) + ... + log2(N+1)) = 2*log2((N+1)!)
118 
119  then cost(tree_creation) = n_compares*ROWID_COMPARE_COST;
120 
121  Total cost of creating trees:
122  (n_trees - 1)*max_size_tree_cost + non_max_size_tree_cost.
123 
124  Approximate value of log2(N!) is calculated by log2_n_fact function.
125 
126 
127  (The Next two are historical, we do all unique operations in memory or fail)
128 
129  2. Cost of merging.
130  If only one tree is created by Unique no merging will be necessary.
131  Otherwise, we model execution of merge_many_buff function and count
132  #of merges. (The reason behind this is that number of buffers is small,
133  while size of buffers is big and we don't want to loose precision with
134  O(x)-style formula)
135 
136  3. If only one tree is created by Unique no disk io will happen.
137  Otherwise, ceil(key_len*n_keys) disk seeks are necessary. We assume
138  these will be random seeks.
139 */
140 
141 double Unique::get_use_cost(uint32_t *, uint32_t nkeys, uint32_t key_size,
142  size_t max_in_memory_size_arg)
143 {
144  ulong max_elements_in_tree;
145  ulong last_tree_elems;
146  double result;
147 
148  max_elements_in_tree= ((ulong) max_in_memory_size_arg /
149  ALIGN_SIZE(sizeof(Tree_Element)+key_size));
150 
151  last_tree_elems= nkeys % max_elements_in_tree;
152 
153  /* Calculate cost of creating trees */
154  result= 2*log2_n_fact(last_tree_elems + 1.0);
155  result /= TIME_FOR_COMPARE_ROWID;
156 
157  return result;
158 }
159 
160 Unique::~Unique()
161 {
162  tree.delete_tree();
163 }
164 
165 
166 /*
167  Clear the tree.
168  You must call reset() if you want to reuse Unique after walk().
169 */
170 
171 void
172 Unique::reset()
173 {
174  tree.reset_tree();
175  assert(elements == 0);
176 }
177 
178 
179 /*
180  DESCRIPTION
181  Walks consecutively through all unique elements:
182  if all elements are in memory, then it simply invokes 'tree_walk', else
183  all flushed trees are loaded to memory piece-by-piece, pieces are
184  sorted, and action is called for each unique value.
185  Note: so as merging resets file_ptrs state, this method can change
186  internal Unique state to undefined: if you want to reuse Unique after
187  walk() you must call reset() first!
188  SYNOPSIS
189  Unique:walk()
190  All params are 'IN':
191  action function-visitor, typed in include/tree.h
192  function is called for each unique element
193  arg argument for visitor, which is passed to it on each call
194  RETURN VALUE
195  0 OK
196  <> 0 error
197  */
198 
199 bool Unique::walk(tree_walk_action action, void *walk_action_arg)
200 {
201  return tree.tree_walk(action, walk_action_arg, left_root_right);
202 }
203 
204 /*
205  Modify the Table element so that when one calls init_records()
206  the rows will be read in priority order.
207 */
208 
209 bool Unique::get(Table *table)
210 {
211  table->sort.found_records= elements+tree.getElementsInTree();
212 
213  if ((record_pointers=table->sort.record_pointers= (unsigned char*)
214  malloc(size * tree.getElementsInTree())))
215  {
216  (void) tree.tree_walk((tree_walk_action) unique_write_to_ptrs,
217  this, left_root_right);
218  return 0;
219  }
220  /* Not enough memory */
221  return 1;
222 }
223 
224 } /* namespace drizzled */
TODO: Rename this file - func.h is stupid.
#define TIME_FOR_COMPARE_ROWID
Definition: definitions.h:150