casacore
ClassicalStatistics.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
26 
27 #ifndef SCIMATH_CLASSICALSTATS_H
28 #define SCIMATH_CLASSICALSTATS_H
29 
30 #include <casacore/casa/aips.h>
31 
32 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.h>
33 
34 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
35 #include <casacore/scimath/Mathematics/StatisticsUtilities.h>
36 
37 #include <set>
38 #include <vector>
39 #include <utility>
40 
41 namespace casacore {
42 
43 // Class to calculate statistics in a "classical" sense, ie using accumulators with no
44 // special filtering beyond optional range filtering etc.
45 //
46 // setCalculateAsAdded() allows one to specify if statistics should be calculated and updated
47 // on upon each call to set/addData(). If False, statistics will be calculated only when
48 // getStatistic(), getStatistics(), or similar methods are called. Setting this value to True
49 // allows the caller to not have to keep all the data accessible at once. Note however, that all
50 // data must be simultaneously accessible if quantile (eg median) calculations are desired.
51 
52 // I attempted to write this class using the Composite design pattern, with eg the
53 // _unweightedStats() and _weightedStats() methods in their own class, but for reasons I
54 // don't understand, that impacted performance significantly. So I'm using the current
55 // architecture, which I know is a bit a maintenance nightmare.
56 
57 template <class AccumType, class DataIterator, class MaskIterator=const Bool*, class WeightsIterator=DataIterator>
59  : public StatisticsAlgorithm<AccumType, DataIterator, MaskIterator, WeightsIterator> {
60 public:
61 
63 
64  // copy semantics
66 
67  virtual ~ClassicalStatistics();
68 
69  // copy semantics
72  );
73 
74  // get the algorithm that this object uses for computing stats
77  };
78 
79  // <group>
80  // In the following group of methods, if the size of the composite dataset
81  // is smaller than
82  // <src>binningThreshholdSizeBytes</src>, the composite dataset
83  // will be (perhaps partially) sorted and persisted in memory during the
84  // call. In that case, and if <src>persistSortedArray</src> is True, this
85  // sorted array will remain in memory after the call and will be used on
86  // subsequent calls of this method when <src>binningThreshholdSizeBytes</src>
87  // is greater than the size of the composite dataset. If
88  // <src>persistSortedArray</src> is False, the sorted array will not be
89  // stored after this call completes and so any subsequent calls for which the
90  // dataset size is less than <src>binningThreshholdSizeBytes</src>, the
91  // dataset will be sorted from scratch. Values which are not included due to
92  // non-unity strides, are not included in any specified ranges, are masked,
93  // or have associated weights of zero are not considered as dataset members
94  // for quantile computations.
95  // If one has a priori information regarding the number of points (npts) and/or
96  // the minimum and maximum values of the data set, these can be supplied to
97  // improve performance. Note however, that if these values are not correct, the
98  // resulting median
99  // and/or quantile values will also not be correct (although see the following notes regarding
100  // max/min). Note that if this object has already had getStatistics()
101  // called, and the min and max were calculated, there is no need to pass these values in
102  // as they have been stored internally and used (although passing them in shouldn't hurt
103  // anything). If provided, npts, the number of points falling in the specified ranges which are
104  // not masked and have weights > 0, should be exactly correct. <src>min</src> can be less than
105  // the true minimum, and <src>max</src> can be greater than the True maximum, but for best
106  // performance, these should be as close to the actual min and max as possible.
107  // In order for quantile computations to occur over multiple datasets, all datasets
108  // must be available. This means that if setCalculateAsAdded()
109  // was previously called by passing in a value of True, these methods will throw
110  // an exception as the previous call indicates that there is no guarantee that
111  // all datasets will be available. If one uses a data provider (by having called
112  // setDataProvider()), then this should not be an issue.
113 
114  // get the median of the distribution.
115  // For a dataset with an odd number of good points, the median is just the value
116  // at index int(N/2) in the equivalent sorted dataset, where N is the number of points.
117  // For a dataset with an even number of points, the median is the mean of the values at
118  // indices int(N/2)-1 and int(N/2) in the sorted dataset.
119  // <src>nBins</src> is the number of bins, per histogram, to use to bin the data. More
120  // bins decrease the likelihood that multiple passes of the data set will be necessary, but
121  // also increase the amount of memory used. If nBins is set to less than 1,000, it is
122  // automatically increased to 1,000; there should be no reason to ever set nBins to be
123  // this small.
124  virtual AccumType getMedian(
125  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
126  CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
127  Bool persistSortedArray=False, uInt64 nBins=10000
128  );
129 
130  // If one needs to compute both the median and quantile values, it is better to call
131  // getMedianAndQuantiles() rather than getMedian() and getQuantiles() separately, as the
132  // first will scan large data sets fewer times than calling the separate methods.
133  // The return value is the median; the quantiles are returned in the <src>quantiles</src> map.
134  // Values in the <src>fractions</src> set represent the locations in the CDF and should be
135  // between 0 and 1, exclusive.
136  virtual AccumType getMedianAndQuantiles(
137  std::map<Double, AccumType>& quantiles, const std::set<Double>& fractions,
138  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
139  CountedPtr<AccumType> knownMax=NULL,
140  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
141  uInt64 nBins=10000
142  );
143 
144  // get the median of the absolute deviation about the median of the data.
145  virtual AccumType getMedianAbsDevMed(
146  CountedPtr<uInt64> knownNpts=NULL,
147  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
148  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
149  uInt64 nBins=10000
150  );
151 
152  // Get the specified quantiles. <src>fractions</src> must be between 0 and 1,
153  // noninclusive.
154  virtual std::map<Double, AccumType> getQuantiles(
155  const std::set<Double>& fractions, CountedPtr<uInt64> knownNpts=NULL,
156  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
157  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
158  uInt64 nBins=10000
159  );
160 
161  // </group>
162 
163  // scan the dataset(s) that have been added, and find the min and max.
164  // This method may be called even if setStatsToCaclulate has been called and
165  // MAX and MIN has been excluded. If setCalculateAsAdded(True) has previously been
166  // called after this object has been (re)initialized, an exception will be thrown.
167  virtual void getMinMax(AccumType& mymin, AccumType& mymax);
168 
169  // scan the dataset(s) that have been added, and find the number of good points.
170  // This method may be called even if setStatsToCaclulate has been called and
171  // NPTS has been excluded. If setCalculateAsAdded(True) has previously been
172  // called after this object has been (re)initialized, an exception will be thrown.
173  virtual uInt64 getNPts();
174 
175  // see base class description
176  virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat);
177 
178  // reset object to initial state. Clears all private fields including data,
179  // accumulators, etc.
180  virtual void reset();
181 
182  // Should statistics be updated with calls to addData or should they only be calculated
183  // upon calls to getStatistics etc? Beware that calling this will automatically reinitialize
184  // the object, so that it will contain no references to data et al. after this method has
185  // been called.
186  virtual void setCalculateAsAdded(Bool c);
187 
188  // An exception will be thrown if setCalculateAsAdded(True) has been called.
190 
191  void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
192 
193 protected:
194 
195  // <group>
196  // scan through the data set to determine the number of good (unmasked, weight > 0,
197  // within range) points. The first with no mask, no
198  // ranges, and no weights is trivial with npts = nr in this class, but is implemented here
199  // so that derived classes may override it.
200  inline virtual void _accumNpts(
201  uInt64& npts,
202  const DataIterator& dataBegin, Int64 nr, uInt dataStride
203  ) const;
204 
205  virtual void _accumNpts(
206  uInt64& npts,
207  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
208  const DataRanges& ranges, Bool isInclude
209  ) const;
210 
211  virtual void _accumNpts(
212  uInt64& npts,
213  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
214  const MaskIterator& maskBegin, uInt maskStride
215  ) const;
216 
217  virtual void _accumNpts(
218  uInt64& npts,
219  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
220  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
221  Bool isInclude
222  ) const;
223 
224  virtual void _accumNpts(
225  uInt64& npts,
226  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
227  Int64 nr, uInt dataStride
228  ) const;
229 
230  virtual void _accumNpts(
231  uInt64& npts,
232  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
233  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
234  ) const;
235 
236  virtual void _accumNpts(
237  uInt64& npts,
238  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
239  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
240  const DataRanges& ranges, Bool isInclude
241  ) const;
242 
243  virtual void _accumNpts(
244  uInt64& npts,
245  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
246  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
247  ) const;
248  // </group>
249 
250  // <group>
251  inline void _accumulate(
252  AccumType& mymin, AccumType& mymax, Int64& minpos, Int64& maxpos,
253  const AccumType& datum , Int64 count
254  );
255 
256  inline void _accumulate(
257  AccumType& mymin, AccumType& mymax, Int64& minpos, Int64& maxpos, const AccumType& datum,
258  const AccumType& weight, Int64 count
259  );
260  // </group>
261 
262  void _addData();
263 
264  void _clearData();
265 
266  void _clearStats();
267 
268  // scan dataset(s) to find min and max
269  void _doMinMax(AccumType& vmin, AccumType& vmax);
270 
271  // <group>
272  // Get the counts of data within the specified histogram bins. The number of
273  // arrays within binCounts will be equal to the number of histograms in <src>binDesc</src>.
274  // Each array within <src>binCounts</src> will have the same number of elements as the
275  // number of bins in its corresponding histogram in <src>binDesc</src>.
276  virtual void _findBins(
277  vector<vector<uInt64> >& binCounts,
278  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
279  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
280  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
281  const vector<AccumType>& maxLimit
282  ) const;
283 
284  virtual void _findBins(
285  vector<vector<uInt64> >& binCounts,
286  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
287  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
288  const DataRanges& ranges, Bool isInclude,
289  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
290  ) const;
291 
292  virtual void _findBins(
293  vector<vector<uInt64> >& binCounts,
294  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
295  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
296  const MaskIterator& maskBegin, uInt maskStride,
297  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
298  ) const;
299 
300  virtual void _findBins(
301  vector<vector<uInt64> >& binCounts,
302  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
303  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
304  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
305  Bool isInclude,
306  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
307  ) const;
308 
309  virtual void _findBins(
310  vector<vector<uInt64> >& binCounts,
311  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
312  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
313  Int64 nr, uInt dataStride,
314  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
315  ) const ;
316 
317  virtual void _findBins(
318  vector<vector<uInt64> >& binCounts,
319  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
320  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
321  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude,
322  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
323  ) const;
324 
325  virtual void _findBins(
326  vector<vector<uInt64> >& binCounts,
327  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
328  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
329  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
330  const DataRanges& ranges, Bool isInclude,
331  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
332  ) const;
333 
334  virtual void _findBins(
335  vector<vector<uInt64> >& binCounts,
336  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
337  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
338  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
339  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
340  ) const;
341  // </group>
342 
343  AccumType _getStatistic(StatisticsData::STATS stat);
344 
346 
347  // retreive stats structure. Allows derived classes to maintain their own
348  // StatsData structs.
349  inline virtual StatsData<AccumType>& _getStatsData() { return _statsData; }
350 
351  inline virtual const StatsData<AccumType>& _getStatsData() const { return _statsData; }
352 
353  // <group>
354  virtual void _minMax(
356  const DataIterator& dataBegin, Int64 nr, uInt dataStride
357  ) const;
358 
359  virtual void _minMax(
361  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
362  const DataRanges& ranges, Bool isInclude
363  ) const;
364 
365  virtual void _minMax(
367  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
368  const MaskIterator& maskBegin, uInt maskStride
369  ) const;
370 
371  virtual void _minMax(
373  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
374  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
375  Bool isInclude
376  ) const;
377 
378  virtual void _minMax(
380  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
381  Int64 nr, uInt dataStride
382  ) const;
383 
384  virtual void _minMax(
386  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
387  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
388  ) const;
389 
390  virtual void _minMax(
392  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
393  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
394  const DataRanges& ranges, Bool isInclude
395  ) const;
396 
397  virtual void _minMax(
399  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
400  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
401  ) const;
402  // </group>
403 
404  //<group>
405  // populate an unsorted array with valid data.
406  // no weights, no mask, no ranges
407  virtual void _populateArray(
408  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr, uInt dataStride
409  ) const;
410 
411  // ranges
412  virtual void _populateArray(
413  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
414  uInt dataStride, const DataRanges& ranges, Bool isInclude
415  ) const;
416 
417  virtual void _populateArray(
418  vector<AccumType>& ary, const DataIterator& dataBegin,
419  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
420  uInt maskStride
421  ) const;
422 
423  // mask and ranges
424  virtual void _populateArray(
425  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
426  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
427  const DataRanges& ranges, Bool isInclude
428  ) const;
429 
430  // weights
431  virtual void _populateArray(
432  vector<AccumType>& ary, const DataIterator& dataBegin,
433  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride
434  ) const;
435 
436  // weights and ranges
437  virtual void _populateArray(
438  vector<AccumType>& ary, const DataIterator& dataBegin,
439  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
440  const DataRanges& ranges, Bool isInclude
441  ) const;
442 
443  // weights and mask
444  virtual void _populateArray(
445  vector<AccumType>& ary, const DataIterator& dataBegin,
446  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
447  const MaskIterator& maskBegin, uInt maskStride
448  ) const;
449 
450  // weights, mask, ranges
451  virtual void _populateArray(
452  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
453  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
454  const DataRanges& ranges, Bool isInclude
455  ) const;
456  // </group>
457 
458  // <group>
459  // Create a vector of unsorted arrays, one array for each bin defined by <src>includeLimits</src>.
460  // <src>includeLimits</src> should be non-overlapping and should be given in ascending order (the
461  // algorithm used assumes this). Once the sum of the lengths of all arrays equals <src>maxCount</src>
462  // the method will return with no further processing.
463  // no weights, no mask, no ranges
464  virtual void _populateArrays(
465  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, Int64 nr, uInt dataStride,
466  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
467  ) const;
468 
469  // ranges
470  virtual void _populateArrays(
471  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, Int64 nr,
472  uInt dataStride, const DataRanges& ranges, Bool isInclude,
473  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
474  ) const;
475 
476  virtual void _populateArrays(
477  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
478  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
479  uInt maskStride,
480  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
481  ) const;
482 
483  // mask and ranges
484  virtual void _populateArrays(
485  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, Int64 nr,
486  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
487  const DataRanges& ranges, Bool isInclude,
488  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
489  ) const;
490 
491  // weights
492  virtual void _populateArrays(
493  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
494  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
495  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
496  ) const;
497 
498  // weights and ranges
499  virtual void _populateArrays(
500  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
501  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
502  const DataRanges& ranges, Bool isInclude,
503  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
504  ) const;
505 
506  // weights and mask
507  virtual void _populateArrays(
508  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin,
509  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
510  const MaskIterator& maskBegin, uInt maskStride,
511  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
512  ) const;
513 
514  // weights, mask, ranges
515  virtual void _populateArrays(
516  vector<vector<AccumType> >& arys, uInt& currentCount, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
517  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
518  const DataRanges& ranges, Bool isInclude,
519  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt maxCount
520  ) const;
521  // </group>
522 
523  // <group>
524  // no weights, no mask, no ranges
525  virtual Bool _populateTestArray(
526  vector<AccumType>& ary, const DataIterator& dataBegin,
527  Int64 nr, uInt dataStride, uInt maxElements
528  ) const;
529 
530  // ranges
531  virtual Bool _populateTestArray(
532  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
533  uInt dataStride, const DataRanges& ranges, Bool isInclude,
534  uInt maxElements
535  ) const;
536 
537  // mask
538  virtual Bool _populateTestArray(
539  vector<AccumType>& ary, const DataIterator& dataBegin,
540  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
541  uInt maskStride, uInt maxElements
542  ) const;
543 
544  // mask and ranges
545  virtual Bool _populateTestArray(
546  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
547  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
548  const DataRanges& ranges, Bool isInclude, uInt maxElements
549  ) const;
550 
551  // weights
552  virtual Bool _populateTestArray(
553  vector<AccumType>& ary, const DataIterator& dataBegin,
554  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
555  uInt maxElements
556  ) const;
557 
558  // weights and ranges
559  virtual Bool _populateTestArray(
560  vector<AccumType>& ary, const DataIterator& dataBegin,
561  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
562  const DataRanges& ranges, Bool isInclude, uInt maxElements
563  ) const;
564 
565  // weights and mask
566  virtual Bool _populateTestArray(
567  vector<AccumType>& ary, const DataIterator& dataBegin,
568  const WeightsIterator& weightBegin, Int64 nr,
569  uInt dataStride, const MaskIterator& maskBegin,
570  uInt maskStride, uInt maxElements
571  ) const;
572 
573  // weights, mask, ranges
574  virtual Bool _populateTestArray(
575  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
576  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
577  const DataRanges& ranges, Bool isInclude,
578  uInt maxElements
579  ) const;
580  // </group>
581 
582  // <group>
583  // no weights, no mask, no ranges
584  virtual void _unweightedStats(
585  uInt64& ngood, AccumType& mymin, AccumType& mymax,
586  Int64& minpos, Int64& maxpos,
587  const DataIterator& dataBegin, Int64 nr, uInt dataStride
588  );
589 
590  // no weights, no mask
591  virtual void _unweightedStats(
592  uInt64& ngood, AccumType& mymin, AccumType& mymax,
593  Int64& minpos, Int64& maxpos,
594  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
595  const DataRanges& ranges, Bool isInclude
596 
597  );
598 
599  virtual void _unweightedStats(
600  uInt64& ngood, AccumType& mymin, AccumType& mymax,
601  Int64& minpos, Int64& maxpos,
602  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
603  const MaskIterator& maskBegin, uInt maskStride
604 
605  );
606 
607  virtual void _unweightedStats(
608  uInt64& ngood, AccumType& mymin, AccumType& mymax,
609  Int64& minpos, Int64& maxpos,
610  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
611  const MaskIterator& maskBegin, uInt maskStride,
612  const DataRanges& ranges, Bool isInclude
613  );
614  // </group>
615 
616  // <group>
617  // has weights, but no mask, no ranges
618  virtual void _weightedStats(
619  AccumType& mymin, AccumType& mymax,
620  Int64& minpos, Int64& maxpos,
621  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
622  Int64 nr, uInt dataStride
623  );
624 
625  virtual void _weightedStats(
626  AccumType& mymin, AccumType& mymax,
627  Int64& minpos, Int64& maxpos,
628  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
629  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
630  );
631 
632  virtual void _weightedStats(
633  AccumType& mymin, AccumType& mymax,
634  Int64& minpos, Int64& maxpos,
635  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
636  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
637  );
638 
639  virtual void _weightedStats(
640  AccumType& mymin, AccumType& mymax,
641  Int64& minpos, Int64& maxpos,
642  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
643  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
644  const DataRanges& ranges, Bool isInclude
645  );
646  // </group>
647 
648 
649 private:
653 
654  // mutables, used to mitigate repeated code
655  mutable typename vector<DataIterator>::const_iterator _dend, _diter;
656  mutable vector<Int64>::const_iterator _citer;
657  mutable vector<uInt>::const_iterator _dsiter;
658  mutable std::map<uInt, MaskIterator> _masks;
659  mutable uInt _maskStride;
660  mutable std::map<uInt, WeightsIterator> _weights;
661  mutable std::map<uInt, DataRanges> _ranges;
662  mutable std::map<uInt, Bool> _isIncludeRanges;
665  mutable MaskIterator _myMask;
666  mutable DataIterator _myData;
667  mutable WeightsIterator _myWeights;
669  mutable uInt64 _myCount;
670 
671  // tally the number of data points that fall into each bin provided by <src>binDesc</src>
672  // Any points that are less than binDesc.minLimit or greater than
673  // binDesc.minLimit + binDesc.nBins*binDesc.binWidth are not included in the counts. A data
674  // point that falls exactly on a bin boundary is considered to be in the higher index bin.
675  // <src>sameVal</src> will be non-null if all the good values in the histogram range are the
676  // same. In that case, the value held will be the value of each of those data points.
677  vector<vector<uInt64> > _binCounts(
678  vector<CountedPtr<AccumType> >& sameVal,
679  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc
680  );
681 
682  // convert in place by taking the absolute value of the difference of the vector and the median
683  static void _convertToAbsDevMedArray(vector<AccumType>& myArray, AccumType median);
684 
685  // Create an unsorted array of the complete data set. If <src>includeLimits</src> is specified,
686  // only points within those limits (including min but excluding max, as per definition of bins),
687  // are included.
688  void _createDataArray(
689  vector<AccumType>& array
690  );
691 
692  void _createDataArrays(
693  vector<vector<AccumType> >& arrays,
694  const vector<std::pair<AccumType, AccumType> > &includeLimits,
695  uInt maxCount
696  );
697  // extract data from multiple histograms given by <src>binDesc</src>.
698  // <src>dataIndices</src> represent the indices of the sorted arrays of values to
699  // extract. There should be exactly one set of data indices to extract for each
700  // supplied histogram. The data indices are relative to the minimum value of the minimum
701  // bin in their repsective histograms. The ordering of the maps in the returned vector represent
702  // the ordering of histograms in <src>binDesc</src>. <src>binDesc</src> should contain
703  // non-overlapping histograms and the histograms should be specified in ascending order.
704  vector<std::map<uInt64, AccumType> > _dataFromMultipleBins(
705  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, uInt maxArraySize,
706  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
707  );
708 
709  vector<std::map<uInt64, AccumType> > _dataFromSingleBins(
710  const vector<uInt64>& binNpts, uInt maxArraySize,
711  const vector<std::pair<AccumType, AccumType> >& binLimits,
712  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
713  );
714 
715  Int64 _doNpts();
716 
717  // get the values for the specified indices in the sorted array of all good data
718  std::map<uInt64, AccumType> _indicesToValues(
719  CountedPtr<uInt64> knownNpts, CountedPtr<AccumType> knownMin,
720  CountedPtr<AccumType> knownMax, uInt maxArraySize,
721  const std::set<uInt64>& dataIndices, Bool persistSortedArray,
722  uInt64 nBins
723  );
724 
725  void _initIterators();
726 
727  void _initLoopVars();
728 
729  // Determine by scanning the dataset if the number of good points is smaller than
730  // <src>maxArraySize</src>. If so, <src>arrayToSort</src> will contain the unsorted
731  // data values. If not, this vector will be empty.
732  Bool _isNptsSmallerThan(vector<AccumType>& arrayToSort, uInt maxArraySize);
733 
734  // If <src>allowPad</src> is True, then pad the lower side of the lowest bin and the
735  // higher side of the highest bin so that minData and maxData do not fall on the edge
736  // of their respective bins. If false, no padding so that minData and maxData are also
737  // exactly the histogram abscissa limits.
738  static void _makeBins(
739  typename StatisticsUtilities<AccumType>::BinDesc& bins, AccumType minData, AccumType maxData, uInt maxBins,
740  Bool allowPad
741  );
742 
743  // If input set has one value, that is the median, if it has two, the median is the average
744  // of those.
745  // static AccumType _medianFromSet(const std::map<uInt64, AccumType>& values);
746 
747  // get the index (for odd npts) or indices (for even npts) of the median of the sorted array.
748  // If knownNpts is not null, it will be used and must be correct. If it is null, the value of
749  // _npts will be used if it has been previously calculated. If not, the data sets will
750  // be scanned to determine npts.
751  std::set<uInt64> _medianIndices(CountedPtr<uInt64> knownNpts);
752 
753  // update min and max if necessary
754  virtual void _updateMaxMin(
755  AccumType mymin, AccumType mymax, Int64 minpos,
756  Int64 maxpos, uInt dataStride, const Int64& currentDataset
757  );
758 
759  // get values from sorted array if the array is small enough to be held in
760  // memory. Note that this is the array containing all good data, not data in
761  // just a single bin representing a subset of good data.
762  // Returns True if the data were successfully retrieved.
763  // If True is returned, the values map will contain a map of index to value.
765  std::map<uInt64, AccumType>& values, CountedPtr<uInt64> knownNpts,
766  const std::set<uInt64>& indices, uInt maxArraySize,
767  Bool persistSortedArray
768  );
769 };
770 
771 }
772 
773 #ifndef CASACORE_NO_AUTO_TEMPLATES
774 #include <casacore/scimath/Mathematics/ClassicalStatistics.tcc>
775 #endif //# CASACORE_NO_AUTO_TEMPLATES
776 
777 #endif
void _doMinMax(AccumType &vmin, AccumType &vmax)
scan dataset(s) to find min and max
vector< DataIterator >::const_iterator _dend
mutables, used to mitigate repeated code
long long Int64
Define the extra non-standard types used by Casacore (like proposed uSize, Size)
Definition: aipsxtype.h:38
virtual void _populateArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
populate an unsorted array with valid data.
virtual void _updateMaxMin(AccumType mymin, AccumType mymax, Int64 minpos, Int64 maxpos, uInt dataStride, const Int64 &currentDataset)
update min and max if necessary
LatticeExprNode median(const LatticeExprNode &expr)
AccumType _getStatistic(StatisticsData::STATS stat)
virtual StatisticsData::ALGORITHM algorithm() const
get the algorithm that this object uses for computing stats
ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > & operator=(const ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > &other)
copy semantics
vector< DataIterator >::const_iterator _diter
StatsData< AccumType > _getStatistics()
virtual StatsData< AccumType > & _getStatsData()
retreive stats structure.
TableExprNode array(const TableExprNode &values, const TableExprNodeSet &shape)
Create an array of the given shape and fill it with the values.
Definition: ExprNode.h:1986
void _createDataArray(vector< AccumType > &array)
Create an unsorted array of the complete data set.
virtual void _weightedStats(AccumType &mymin, AccumType &mymax, Int64 &minpos, Int64 &maxpos, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, Int64 nr, uInt dataStride)
has weights, but no mask, no ranges
unsigned long long uInt64
Definition: aipsxtype.h:39
std::set< uInt64 > _medianIndices(CountedPtr< uInt64 > knownNpts)
If input set has one value, that is the median, if it has two, the median is the average of those...
virtual std::pair< Int64, Int64 > getStatisticIndex(StatisticsData::STATS stat)
see base class description
std::map< uInt, DataRanges > _ranges
void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Provide guidance to algorithms by specifying a priori which statistics the caller would like calculat...
Abstract base class which defines interface for providing "datasets" to the statistics framework when...
virtual void _populateArrays(vector< vector< AccumType > > &arys, uInt &currentCount, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt maxCount) const
Create a vector of unsorted arrays, one array for each bin defined by includeLimits.
Class to calculate statistics in a "classical" sense, ie using accumulators with no special filtering...
virtual uInt64 getNPts()
scan the dataset(s) that have been added, and find the number of good points.
virtual void getMinMax(AccumType &mymin, AccumType &mymax)
scan the dataset(s) that have been added, and find the min and max.
vector< vector< uInt64 > > _binCounts(vector< CountedPtr< AccumType > > &sameVal, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc)
tally the number of data points that fall into each bin provided by binDesc Any points that are less ...
std::map< uInt, WeightsIterator > _weights
virtual void _unweightedStats(uInt64 &ngood, AccumType &mymin, AccumType &mymax, Int64 &minpos, Int64 &maxpos, const DataIterator &dataBegin, Int64 nr, uInt dataStride)
no weights, no mask, no ranges
ALGORITHM
implemented algorithms
void _addData()
Allows derived classes to do things after data is set or added.
Referenced counted pointer for constant data.
Definition: CountedPtr.h:86
virtual void _findBins(vector< vector< uInt64 > > &binCounts, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit) const
Get the counts of data within the specified histogram bins.
virtual Bool _populateTestArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride, uInt maxElements) const
no weights, no mask, no ranges
virtual AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
get the median of the absolute deviation about the median of the data.
std::map< uInt, Bool > _isIncludeRanges
static void _convertToAbsDevMedArray(vector< AccumType > &myArray, AccumType median)
convert in place by taking the absolute value of the difference of the vector and the median ...
void _createDataArrays(vector< vector< AccumType > > &arrays, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt maxCount)
#define DataRanges
Commonly used types in statistics framework.
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:39
virtual void setCalculateAsAdded(Bool c)
Should statistics be updated with calls to addData or should they only be calculated upon calls to ge...
void setDataProvider(StatsDataProvider< AccumType, DataIterator, MaskIterator, WeightsIterator > *dataProvider)
An exception will be thrown if setCalculateAsAdded(True) has been called.
virtual AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantiles, const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
If one needs to compute both the median and quantile values, it is better to call getMedianAndQuantil...
virtual std::map< Double, AccumType > getQuantiles(const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
Get the specified quantiles.
std::map< uInt, MaskIterator > _masks
static void _makeBins(typename StatisticsUtilities< AccumType >::BinDesc &bins, AccumType minData, AccumType maxData, uInt maxBins, Bool allowPad)
If allowPad is True, then pad the lower side of the lowest bin and the higher side of the highest bin...
vector< uInt >::const_iterator _dsiter
const Bool False
Definition: aipstype.h:41
void _accumulate(AccumType &mymin, AccumType &mymax, Int64 &minpos, Int64 &maxpos, const AccumType &datum, Int64 count)
vector< std::map< uInt64, AccumType > > _dataFromMultipleBins(const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, uInt maxArraySize, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
extract data from multiple histograms given by binDesc.
StatsData< AccumType > _statsData
virtual void reset()
reset object to initial state.
virtual void _accumNpts(uInt64 &npts, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
scan through the data set to determine the number of good (unmasked, weight > 0, within range) points...
virtual const StatsData< AccumType > & _getStatsData() const
vector< Int64 >::const_iterator _citer
const Double c
Fundamental physical constants (SI units):
virtual AccumType getMedian(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
In the following group of methods, if the size of the composite dataset is smaller than binningThresh...
std::map< uInt64, AccumType > _indicesToValues(CountedPtr< uInt64 > knownNpts, CountedPtr< AccumType > knownMin, CountedPtr< AccumType > knownMax, uInt maxArraySize, const std::set< uInt64 > &dataIndices, Bool persistSortedArray, uInt64 nBins)
get the values for the specified indices in the sorted array of all good data
Bool _isNptsSmallerThan(vector< AccumType > &arrayToSort, uInt maxArraySize)
Determine by scanning the dataset if the number of good points is smaller than maxArraySize.
Bool _valuesFromSortedArray(std::map< uInt64, AccumType > &values, CountedPtr< uInt64 > knownNpts, const std::set< uInt64 > &indices, uInt maxArraySize, Bool persistSortedArray)
get values from sorted array if the array is small enough to be held in memory.
vector< std::map< uInt64, AccumType > > _dataFromSingleBins(const vector< uInt64 > &binNpts, uInt maxArraySize, const vector< std::pair< AccumType, AccumType > > &binLimits, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
virtual void _minMax(CountedPtr< AccumType > &mymin, CountedPtr< AccumType > &mymax, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
Base class of statistics algorithm class hierarchy.
this file contains all the compiler specific defines
Definition: mainpage.dox:28
unsigned int uInt
Definition: aipstype.h:48
description of a regularly spaced bins with the first bin having lower limit of minLimit and having n...