casacore
ClassicalStatistics.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
26 
27 #ifndef SCIMATH_CLASSICALSTATS_H
28 #define SCIMATH_CLASSICALSTATS_H
29 
30 #include <casacore/casa/aips.h>
31 
32 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.h>
33 
34 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
35 #include <casacore/scimath/Mathematics/StatisticsUtilities.h>
36 
37 #include <set>
38 #include <vector>
39 #include <utility>
40 
41 namespace casacore {
42 
43 template <class T> class PtrHolder;
44 
45 // Class to calculate statistics in a "classical" sense, ie using accumulators with no
46 // special filtering beyond optional range filtering etc.
47 //
48 // setCalculateAsAdded() allows one to specify if statistics should be calculated and updated
49 // on upon each call to set/addData(). If False, statistics will be calculated only when
50 // getStatistic(), getStatistics(), or similar methods are called. Setting this value to True
51 // allows the caller to not have to keep all the data accessible at once. Note however, that all
52 // data must be simultaneously accessible if quantile (eg median) calculations are desired.
53 
54 // I attempted to write this class using the Composite design pattern, with eg the
55 // _unweightedStats() and _weightedStats() methods in their own class, but for reasons I
56 // don't understand, that impacted performance significantly. So I'm using the current
57 // architecture, which I know is a bit a maintenance nightmare.
58 
59 template <class AccumType, class DataIterator, class MaskIterator=const Bool*, class WeightsIterator=DataIterator>
61  : public StatisticsAlgorithm<CASA_STATP> {
62 public:
63 
65 
66  // copy semantics
68 
69  virtual ~ClassicalStatistics();
70 
71  // copy semantics
74  );
75 
76  // Clone this instance
77  virtual StatisticsAlgorithm<CASA_STATP>* clone() const;
78 
79  // get the algorithm that this object uses for computing stats
82  };
83 
84  // <group>
85  // In the following group of methods, if the size of the composite dataset
86  // is smaller than
87  // <src>binningThreshholdSizeBytes</src>, the composite dataset
88  // will be (perhaps partially) sorted and persisted in memory during the
89  // call. In that case, and if <src>persistSortedArray</src> is True, this
90  // sorted array will remain in memory after the call and will be used on
91  // subsequent calls of this method when <src>binningThreshholdSizeBytes</src>
92  // is greater than the size of the composite dataset. If
93  // <src>persistSortedArray</src> is False, the sorted array will not be
94  // stored after this call completes and so any subsequent calls for which the
95  // dataset size is less than <src>binningThreshholdSizeBytes</src>, the
96  // dataset will be sorted from scratch. Values which are not included due to
97  // non-unity strides, are not included in any specified ranges, are masked,
98  // or have associated weights of zero are not considered as dataset members
99  // for quantile computations.
100  // If one has a priori information regarding the number of points (npts) and/or
101  // the minimum and maximum values of the data set, these can be supplied to
102  // improve performance. Note however, that if these values are not correct, the
103  // resulting median
104  // and/or quantile values will also not be correct (although see the following notes regarding
105  // max/min). Note that if this object has already had getStatistics()
106  // called, and the min and max were calculated, there is no need to pass these values in
107  // as they have been stored internally and used (although passing them in shouldn't hurt
108  // anything). If provided, npts, the number of points falling in the specified ranges which are
109  // not masked and have weights > 0, should be exactly correct. <src>min</src> can be less than
110  // the true minimum, and <src>max</src> can be greater than the True maximum, but for best
111  // performance, these should be as close to the actual min and max as possible.
112  // In order for quantile computations to occur over multiple datasets, all datasets
113  // must be available. This means that if setCalculateAsAdded()
114  // was previously called by passing in a value of True, these methods will throw
115  // an exception as the previous call indicates that there is no guarantee that
116  // all datasets will be available. If one uses a data provider (by having called
117  // setDataProvider()), then this should not be an issue.
118 
119  // get the median of the distribution.
120  // For a dataset with an odd number of good points, the median is just the value
121  // at index int(N/2) in the equivalent sorted dataset, where N is the number of points.
122  // For a dataset with an even number of points, the median is the mean of the values at
123  // indices int(N/2)-1 and int(N/2) in the sorted dataset.
124  // <src>nBins</src> is the number of bins, per histogram, to use to bin the data. More
125  // bins decrease the likelihood that multiple passes of the data set will be necessary, but
126  // also increase the amount of memory used. If nBins is set to less than 1,000, it is
127  // automatically increased to 1,000; there should be no reason to ever set nBins to be
128  // this small.
129  virtual AccumType getMedian(
130  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
131  CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
132  Bool persistSortedArray=False, uInt64 nBins=10000
133  );
134 
135  // If one needs to compute both the median and quantile values, it is better to call
136  // getMedianAndQuantiles() rather than getMedian() and getQuantiles() separately, as the
137  // first will scan large data sets fewer times than calling the separate methods.
138  // The return value is the median; the quantiles are returned in the <src>quantiles</src> map.
139  // Values in the <src>fractions</src> set represent the locations in the CDF and should be
140  // between 0 and 1, exclusive.
141  virtual AccumType getMedianAndQuantiles(
142  std::map<Double, AccumType>& quantiles, const std::set<Double>& fractions,
143  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
144  CountedPtr<AccumType> knownMax=NULL,
145  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
146  uInt64 nBins=10000
147  );
148 
149  // get the median of the absolute deviation about the median of the data.
150  virtual AccumType getMedianAbsDevMed(
151  CountedPtr<uInt64> knownNpts=NULL,
152  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
153  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
154  uInt64 nBins=10000
155  );
156 
157  // Get the specified quantiles. <src>fractions</src> must be between 0 and 1,
158  // noninclusive.
159  virtual std::map<Double, AccumType> getQuantiles(
160  const std::set<Double>& fractions, CountedPtr<uInt64> knownNpts=NULL,
161  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
162  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
163  uInt64 nBins=10000
164  );
165 
166  // </group>
167 
168  // scan the dataset(s) that have been added, and find the min and max.
169  // This method may be called even if setStatsToCaclulate has been called and
170  // MAX and MIN has been excluded. If setCalculateAsAdded(True) has previously been
171  // called after this object has been (re)initialized, an exception will be thrown.
172  virtual void getMinMax(AccumType& mymin, AccumType& mymax);
173 
174  // scan the dataset(s) that have been added, and find the number of good points.
175  // This method may be called even if setStatsToCaclulate has been called and
176  // NPTS has been excluded. If setCalculateAsAdded(True) has previously been
177  // called after this object has been (re)initialized, an exception will be thrown.
178  virtual uInt64 getNPts();
179 
180  // see base class description
181  virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat);
182 
183  // Has any data been added to this object? Will return False if the object has
184  // been reset and no data have been added afterward.
185  Bool hasData() const { return _hasData; }
186 
187  // reset object to initial state. Clears all private fields including data,
188  // accumulators, etc.
189  virtual void reset();
190 
191  // Should statistics be updated with calls to addData or should they only be calculated
192  // upon calls to getStatistics etc? Beware that calling this will automatically reinitialize
193  // the object, so that it will contain no references to data et al. after this method has
194  // been called.
195  virtual void setCalculateAsAdded(Bool c);
196 
197  // An exception will be thrown if setCalculateAsAdded(True) has been called.
199 
200  void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
201 
202 protected:
203 
204  // <group>
205  // scan through the data set to determine the number of good (unmasked, weight > 0,
206  // within range) points. The first with no mask, no
207  // ranges, and no weights is trivial with npts = nr in this class, but is implemented here
208  // so that derived classes may override it.
209  inline virtual void _accumNpts(
210  uInt64& npts,
211  const DataIterator& dataBegin, Int64 nr, uInt dataStride
212  ) const;
213 
214  virtual void _accumNpts(
215  uInt64& npts,
216  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
217  const DataRanges& ranges, Bool isInclude
218  ) const;
219 
220  virtual void _accumNpts(
221  uInt64& npts,
222  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
223  const MaskIterator& maskBegin, uInt maskStride
224  ) const;
225 
226  virtual void _accumNpts(
227  uInt64& npts,
228  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
229  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
230  Bool isInclude
231  ) const;
232 
233  virtual void _accumNpts(
234  uInt64& npts,
235  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
236  Int64 nr, uInt dataStride
237  ) const;
238 
239  virtual void _accumNpts(
240  uInt64& npts,
241  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
242  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
243  ) const;
244 
245  virtual void _accumNpts(
246  uInt64& npts,
247  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
248  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
249  const DataRanges& ranges, Bool isInclude
250  ) const;
251 
252  virtual void _accumNpts(
253  uInt64& npts,
254  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
255  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
256  ) const;
257  // </group>
258 
259  // <group>
260  inline void _accumulate(
261  StatsData<AccumType>& stats, const AccumType& datum,
262  const LocationType& location
263  );
264 
265  inline void _accumulate(
266  StatsData<AccumType>& stats, const AccumType& datum,
267  const AccumType& weight, const LocationType& location
268  );
269  // </group>
270 
271  void _addData();
272 
273  void _clearStats();
274 
275  // scan dataset(s) to find min and max
276  void _doMinMax(AccumType& vmin, AccumType& vmax);
277 
278  // <group>
279  // Get the counts of data within the specified histogram bins. The number of
280  // arrays within binCounts will be equal to the number of histograms in <src>binDesc</src>.
281  // Each array within <src>binCounts</src> will have the same number of elements as the
282  // number of bins in its corresponding histogram in <src>binDesc</src>.
283  virtual void _findBins(
284  vector<vector<uInt64> >& binCounts,
285  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
286  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
287  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
288  const vector<AccumType>& maxLimit
289  ) const;
290 
291  virtual void _findBins(
292  vector<vector<uInt64> >& binCounts,
293  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
294  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
295  const DataRanges& ranges, Bool isInclude,
296  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
297  ) const;
298 
299  virtual void _findBins(
300  vector<vector<uInt64> >& binCounts,
301  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
302  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
303  const MaskIterator& maskBegin, uInt maskStride,
304  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
305  ) const;
306 
307  virtual void _findBins(
308  vector<vector<uInt64> >& binCounts,
309  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
310  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
311  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
312  Bool isInclude,
313  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
314  ) const;
315 
316  virtual void _findBins(
317  vector<vector<uInt64> >& binCounts,
318  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
319  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
320  Int64 nr, uInt dataStride,
321  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
322  ) const ;
323 
324  virtual void _findBins(
325  vector<vector<uInt64> >& binCounts,
326  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
327  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
328  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude,
329  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
330  ) const;
331 
332  virtual void _findBins(
333  vector<vector<uInt64> >& binCounts,
334  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
335  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
336  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
337  const DataRanges& ranges, Bool isInclude,
338  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
339  ) const;
340 
341  virtual void _findBins(
342  vector<vector<uInt64> >& binCounts,
343  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
344  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
345  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
346  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
347  ) const;
348  // </group>
349 
350  Bool _getDoMaxMin() const { return _doMaxMin; }
351 
352  Int64 _getIDataset() const { return _idataset; }
353 
354  virtual StatsData<AccumType> _getInitialStats() const;
355 
356  AccumType _getStatistic(StatisticsData::STATS stat);
357 
359 
360  // retreive stats structure. Allows derived classes to maintain their own
361  // StatsData structs.
362  inline virtual StatsData<AccumType>& _getStatsData() { return _statsData; }
363 
364  inline virtual const StatsData<AccumType>& _getStatsData() const { return _statsData; }
365 
366  // <group>
367  virtual void _minMax(
369  const DataIterator& dataBegin, Int64 nr, uInt dataStride
370  ) const;
371 
372  virtual void _minMax(
374  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
375  const DataRanges& ranges, Bool isInclude
376  ) const;
377 
378  virtual void _minMax(
380  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
381  const MaskIterator& maskBegin, uInt maskStride
382  ) const;
383 
384  virtual void _minMax(
386  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
387  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
388  Bool isInclude
389  ) const;
390 
391  virtual void _minMax(
393  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
394  Int64 nr, uInt dataStride
395  ) const;
396 
397  virtual void _minMax(
399  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
400  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
401  ) const;
402 
403  virtual void _minMax(
405  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
406  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
407  const DataRanges& ranges, Bool isInclude
408  ) const;
409 
410  virtual void _minMax(
412  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
413  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
414  ) const;
415  // </group>
416 
417  //<group>
418  // populate an unsorted array with valid data.
419  // no weights, no mask, no ranges
420  virtual void _populateArray(
421  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr, uInt dataStride
422  ) const;
423 
424  // ranges
425  virtual void _populateArray(
426  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
427  uInt dataStride, const DataRanges& ranges, Bool isInclude
428  ) const;
429 
430  virtual void _populateArray(
431  vector<AccumType>& ary, const DataIterator& dataBegin,
432  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
433  uInt maskStride
434  ) const;
435 
436  // mask and ranges
437  virtual void _populateArray(
438  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
439  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
440  const DataRanges& ranges, Bool isInclude
441  ) const;
442 
443  // weights
444  virtual void _populateArray(
445  vector<AccumType>& ary, const DataIterator& dataBegin,
446  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride
447  ) const;
448 
449  // weights and ranges
450  virtual void _populateArray(
451  vector<AccumType>& ary, const DataIterator& dataBegin,
452  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
453  const DataRanges& ranges, Bool isInclude
454  ) const;
455 
456  // weights and mask
457  virtual void _populateArray(
458  vector<AccumType>& ary, const DataIterator& dataBegin,
459  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
460  const MaskIterator& maskBegin, uInt maskStride
461  ) const;
462 
463  // weights, mask, ranges
464  virtual void _populateArray(
465  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
466  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
467  const DataRanges& ranges, Bool isInclude
468  ) const;
469  // </group>
470 
471  // <group>
472  // Create a vector of unsorted arrays, one array for each bin defined by <src>includeLimits</src>.
473  // <src>includeLimits</src> should be non-overlapping and should be given in ascending order (the
474  // algorithm used assumes this). Once the sum of the lengths of all arrays equals <src>maxCount</src>
475  // the method will return with no further processing.
476  // no weights, no mask, no ranges
477  virtual void _populateArrays(
478  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr, uInt dataStride,
479  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
480  ) const;
481 
482  // ranges
483  virtual void _populateArrays(
484  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr,
485  uInt dataStride, const DataRanges& ranges, Bool isInclude,
486  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
487  ) const;
488 
489  virtual void _populateArrays(
490  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
491  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
492  uInt maskStride,
493  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
494  ) const;
495 
496  // mask and ranges
497  virtual void _populateArrays(
498  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr,
499  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
500  const DataRanges& ranges, Bool isInclude,
501  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
502  ) const;
503 
504  // weights
505  virtual void _populateArrays(
506  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
507  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
508  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
509  ) const;
510 
511  // weights and ranges
512  virtual void _populateArrays(
513  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
514  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
515  const DataRanges& ranges, Bool isInclude,
516  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
517  ) const;
518 
519  // weights and mask
520  virtual void _populateArrays(
521  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
522  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
523  const MaskIterator& maskBegin, uInt maskStride,
524  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
525  ) const;
526 
527  // weights, mask, ranges
528  virtual void _populateArrays(
529  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
530  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
531  const DataRanges& ranges, Bool isInclude,
532  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
533  ) const;
534  // </group>
535 
536  // <group>
537  // no weights, no mask, no ranges
538  virtual Bool _populateTestArray(
539  vector<AccumType>& ary, const DataIterator& dataBegin,
540  Int64 nr, uInt dataStride, uInt maxElements
541  ) const;
542 
543  // ranges
544  virtual Bool _populateTestArray(
545  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
546  uInt dataStride, const DataRanges& ranges, Bool isInclude,
547  uInt maxElements
548  ) const;
549 
550  // mask
551  virtual Bool _populateTestArray(
552  vector<AccumType>& ary, const DataIterator& dataBegin,
553  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
554  uInt maskStride, uInt maxElements
555  ) const;
556 
557  // mask and ranges
558  virtual Bool _populateTestArray(
559  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
560  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
561  const DataRanges& ranges, Bool isInclude, uInt maxElements
562  ) const;
563 
564  // weights
565  virtual Bool _populateTestArray(
566  vector<AccumType>& ary, const DataIterator& dataBegin,
567  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
568  uInt maxElements
569  ) const;
570 
571  // weights and ranges
572  virtual Bool _populateTestArray(
573  vector<AccumType>& ary, const DataIterator& dataBegin,
574  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
575  const DataRanges& ranges, Bool isInclude, uInt maxElements
576  ) const;
577 
578  // weights and mask
579  virtual Bool _populateTestArray(
580  vector<AccumType>& ary, const DataIterator& dataBegin,
581  const WeightsIterator& weightBegin, Int64 nr,
582  uInt dataStride, const MaskIterator& maskBegin,
583  uInt maskStride, uInt maxElements
584  ) const;
585 
586  // weights, mask, ranges
587  virtual Bool _populateTestArray(
588  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
589  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
590  const DataRanges& ranges, Bool isInclude,
591  uInt maxElements
592  ) const;
593  // </group>
594 
595  // <group>
596  // no weights, no mask, no ranges
597  virtual void _unweightedStats(
598  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
599  const DataIterator& dataBegin, Int64 nr, uInt dataStride
600  );
601 
602  // no weights, no mask
603  virtual void _unweightedStats(
604  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
605  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
606  const DataRanges& ranges, Bool isInclude
607  );
608 
609  virtual void _unweightedStats(
610  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
611  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
612  const MaskIterator& maskBegin, uInt maskStride
613  );
614 
615  virtual void _unweightedStats(
616  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
617  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
618  const MaskIterator& maskBegin, uInt maskStride,
619  const DataRanges& ranges, Bool isInclude
620  );
621 
622  // </group>
623  virtual void _updateDataProviderMaxMin(
624  const StatsData<AccumType>& threadStats
625  );
626 
627  // <group>
628  // has weights, but no mask, no ranges
629  virtual void _weightedStats(
630  StatsData<AccumType>& stats, LocationType& location,
631  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
632  Int64 nr, uInt dataStride
633  );
634 
635  virtual void _weightedStats(
636  StatsData<AccumType>& stats, LocationType& location,
637  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
638  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
639  );
640 
641  virtual void _weightedStats(
642  StatsData<AccumType>& stats, LocationType& location,
643  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
644  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
645  );
646 
647  virtual void _weightedStats(
648  StatsData<AccumType>& stats, LocationType& location,
649  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
650  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
651  const DataRanges& ranges, Bool isInclude
652  );
653  // </group>
654 
655 private:
659  _hasData;
660 
661  // mutables, used to mitigate repeated code
662  mutable typename vector<DataIterator>::const_iterator _dend, _diter;
663  mutable vector<Int64>::const_iterator _citer;
664  mutable vector<uInt>::const_iterator _dsiter;
665  mutable std::map<uInt, MaskIterator> _masks;
666  mutable uInt _maskStride;
667  mutable std::map<uInt, WeightsIterator> _weights;
668  mutable std::map<uInt, DataRanges> _ranges;
669  mutable std::map<uInt, Bool> _isIncludeRanges;
672  mutable MaskIterator _myMask;
673  mutable DataIterator _myData;
674  mutable WeightsIterator _myWeights;
676  mutable uInt64 _myCount;
677 
678  // tally the number of data points that fall into each bin provided by <src>binDesc</src>
679  // Any points that are less than binDesc.minLimit or greater than
680  // binDesc.minLimit + binDesc.nBins*binDesc.binWidth are not included in the counts. A data
681  // point that falls exactly on a bin boundary is considered to be in the higher index bin.
682  // <src>sameVal</src> will be non-null if all the good values in the histogram range are the
683  // same. In that case, the value held will be the value of each of those data points.
684  vector<vector<uInt64> > _binCounts(
685  vector<CountedPtr<AccumType> >& sameVal,
686  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc
687  );
688 
689  void _computeBins(
690  vector<vector<uInt64> >& bins, vector<CountedPtr<AccumType> >& sameVal,
691  vector<Bool>& allSame, DataIterator dataIter, MaskIterator maskIter,
692  WeightsIterator weightsIter, uInt64 count,
693  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
694  const vector<AccumType>& maxLimit
695  );
696 
697  void _computeDataArray(
698  vector<AccumType>& ary, DataIterator dataIter,
699  MaskIterator maskIter, WeightsIterator weightsIter,
700  uInt64 dataCount
701  );
702 
703  void _computeDataArrays(
704  vector<vector<AccumType> >& arys, uInt64& currentCount,
705  DataIterator dataIter, MaskIterator maskIter,
706  WeightsIterator weightsIter, uInt64 dataCount,
707  const vector<std::pair<AccumType, AccumType> >& includeLimits,
708  uInt64 maxCount
709  );
710 
711  void _computeMinMax(
713  DataIterator dataIter, MaskIterator maskIter,
714  WeightsIterator weightsIter, uInt64 dataCount
715  );
716 
717  void _computeStats(
718  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
719  DataIterator dataIter, MaskIterator maskIter,
720  WeightsIterator weightsIter, uInt64 count
721  );
722 
723  // convert in place by taking the absolute value of the difference of the vector and the median
724  static void _convertToAbsDevMedArray(vector<AccumType>& myArray, AccumType median);
725 
726  // Create an unsorted array of the complete data set. If <src>includeLimits</src> is specified,
727  // only points within those limits (including min but excluding max, as per definition of bins),
728  // are included.
729  void _createDataArray(
730  vector<AccumType>& array
731  );
732 
733  void _createDataArrays(
734  vector<vector<AccumType> >& arrays,
735  const vector<std::pair<AccumType, AccumType> > &includeLimits,
736  uInt64 maxCount
737  );
738  // extract data from multiple histograms given by <src>binDesc</src>.
739  // <src>dataIndices</src> represent the indices of the sorted arrays of values to
740  // extract. There should be exactly one set of data indices to extract for each
741  // supplied histogram. The data indices are relative to the minimum value of the minimum
742  // bin in their repsective histograms. The ordering of the maps in the returned vector represent
743  // the ordering of histograms in <src>binDesc</src>. <src>binDesc</src> should contain
744  // non-overlapping histograms and the histograms should be specified in ascending order.
745  vector<std::map<uInt64, AccumType> > _dataFromMultipleBins(
746  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, uInt64 maxArraySize,
747  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
748  );
749 
750  vector<std::map<uInt64, AccumType> > _dataFromSingleBins(
751  const vector<uInt64>& binNpts, uInt64 maxArraySize,
752  const vector<std::pair<AccumType, AccumType> >& binLimits,
753  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
754  );
755 
756  Int64 _doNpts();
757 
758  // increment the relevant loop counters
759  Bool _increment(Bool includeIDataset);
760 
761  // increment thread-based iterators
763  DataIterator& dataIter, MaskIterator& maskIter,
764  WeightsIterator& weightsIter, uInt64& offset, uInt nthreads
765  ) const;
766 
767  // get the values for the specified indices in the sorted array of all good data
768  std::map<uInt64, AccumType> _indicesToValues(
769  CountedPtr<uInt64> knownNpts, CountedPtr<AccumType> knownMin,
770  CountedPtr<AccumType> knownMax, uInt64 maxArraySize,
771  const std::set<uInt64>& dataIndices, Bool persistSortedArray,
772  uInt64 nBins
773  );
774 
775  void _initIterators();
776 
777  void _initLoopVars();
778 
779  void _initThreadVars(
780  uInt& nBlocks, uInt64& extra, uInt& nthreads, PtrHolder<DataIterator>& dataIter,
781  PtrHolder<MaskIterator>& maskIter, PtrHolder<WeightsIterator>& weightsIter,
782  PtrHolder<uInt64>& offset, uInt nThreadsMax
783  ) const;
784 
785  // Determine by scanning the dataset if the number of good points is smaller than
786  // <src>maxArraySize</src>. If so, <src>arrayToSort</src> will contain the unsorted
787  // data values. If not, this vector will be empty.
788  Bool _isNptsSmallerThan(vector<AccumType>& arrayToSort, uInt maxArraySize);
789 
790  // If <src>allowPad</src> is True, then pad the lower side of the lowest bin and the
791  // higher side of the highest bin so that minData and maxData do not fall on the edge
792  // of their respective bins. If false, no padding so that minData and maxData are also
793  // exactly the histogram abscissa limits.
794  static void _makeBins(
795  typename StatisticsUtilities<AccumType>::BinDesc& bins, AccumType minData, AccumType maxData, uInt maxBins,
796  Bool allowPad
797  );
798 
799  static void _mergeResults(
800  vector<vector<uInt64> >& bins, vector<CountedPtr<AccumType> >& sameVal,
801  vector<Bool>& allSame, const PtrHolder<vector<vector<uInt64> > >& tBins,
802  const PtrHolder<vector<CountedPtr<AccumType> > >& tSameVal,
803  const PtrHolder<vector<Bool> >& tAllSame, uInt nThreadsMax
804  );
805 
806  // get the index (for odd npts) or indices (for even npts) of the median of the sorted array.
807  // If knownNpts is not null, it will be used and must be correct. If it is null, the value of
808  // _npts will be used if it has been previously calculated. If not, the data sets will
809  // be scanned to determine npts.
810  std::set<uInt64> _medianIndices(CountedPtr<uInt64> knownNpts);
811 
812  uInt _nThreadsMax() const;
813 
814  uInt _threadIdx() const;
815 
816  // get values from sorted array if the array is small enough to be held in
817  // memory. Note that this is the array containing all good data, not data in
818  // just a single bin representing a subset of good data.
819  // Returns True if the data were successfully retrieved.
820  // If True is returned, the values map will contain a map of index to value.
822  std::map<uInt64, AccumType>& values, CountedPtr<uInt64> knownNpts,
823  const std::set<uInt64>& indices, uInt64 maxArraySize,
824  Bool persistSortedArray
825  );
826 };
827 
828 }
829 
830 #ifndef CASACORE_NO_AUTO_TEMPLATES
831 #include <casacore/scimath/Mathematics/ClassicalStatistics.tcc>
832 #endif //# CASACORE_NO_AUTO_TEMPLATES
833 
834 #endif
void _doMinMax(AccumType &vmin, AccumType &vmax)
scan dataset(s) to find min and max
vector< std::map< uInt64, AccumType > > _dataFromMultipleBins(const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, uInt64 maxArraySize, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
extract data from multiple histograms given by binDesc.
Bool _valuesFromSortedArray(std::map< uInt64, AccumType > &values, CountedPtr< uInt64 > knownNpts, const std::set< uInt64 > &indices, uInt64 maxArraySize, Bool persistSortedArray)
get values from sorted array if the array is small enough to be held in memory.
vector< DataIterator >::const_iterator _dend
mutables, used to mitigate repeated code
long long Int64
Define the extra non-standard types used by Casacore (like proposed uSize, Size)
Definition: aipsxtype.h:38
void _computeMinMax(CountedPtr< AccumType > &mymax, CountedPtr< AccumType > &mymin, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount)
LatticeExprNode median(const LatticeExprNode &expr)
AccumType _getStatistic(StatisticsData::STATS stat)
vector< DataIterator >::const_iterator _diter
virtual void _minMax(CountedPtr< AccumType > &mymin, CountedPtr< AccumType > &mymax, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
StatsData< AccumType > _getStatistics()
virtual StatsData< AccumType > & _getStatsData()
retreive stats structure.
TableExprNode array(const TableExprNode &values, const TableExprNodeSet &shape)
Create an array of the given shape and fill it with the values.
Definition: ExprNode.h:2093
void _createDataArray(vector< AccumType > &array)
Create an unsorted array of the complete data set.
unsigned long long uInt64
Definition: aipsxtype.h:39
std::set< uInt64 > _medianIndices(CountedPtr< uInt64 > knownNpts)
get the index (for odd npts) or indices (for even npts) of the median of the sorted array...
PtrHolder(const PtrHolder< T > &other)
virtual std::pair< Int64, Int64 > getStatisticIndex(StatisticsData::STATS stat)
see base class description
ClassicalStatistics< CASA_STATP > & operator=(const ClassicalStatistics< CASA_STATP > &other)
copy semantics
std::map< uInt64, AccumType > _indicesToValues(CountedPtr< uInt64 > knownNpts, CountedPtr< AccumType > knownMin, CountedPtr< AccumType > knownMax, uInt64 maxArraySize, const std::set< uInt64 > &dataIndices, Bool persistSortedArray, uInt64 nBins)
get the values for the specified indices in the sorted array of all good data
std::pair< Int64, Int64 > LocationType
std::map< uInt, DataRanges > _ranges
void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Provide guidance to algorithms by specifying a priori which statistics the caller would like calculat...
Class to calculate statistics in a "classical" sense, ie using accumulators with no special filtering...
void _computeBins(vector< vector< uInt64 > > &bins, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 count, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit)
virtual uInt64 getNPts()
scan the dataset(s) that have been added, and find the number of good points.
Hold and delete pointers not deleted by object destructors.
Definition: PtrHolder.h:81
virtual void getMinMax(AccumType &mymin, AccumType &mymax)
scan the dataset(s) that have been added, and find the min and max.
vector< vector< uInt64 > > _binCounts(vector< CountedPtr< AccumType > > &sameVal, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc)
tally the number of data points that fall into each bin provided by binDesc Any points that are less ...
std::map< uInt, WeightsIterator > _weights
void _computeDataArray(vector< AccumType > &ary, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount)
ALGORITHM
implemented algorithms
virtual const StatsData< AccumType > & _getStatsData() const
virtual void _populateArrays(vector< vector< AccumType > > &arys, uInt64 &currentCount, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount) const
Create a vector of unsorted arrays, one array for each bin defined by includeLimits.
void _addData()
Allows derived classes to do things after data is set or added.
Referenced counted pointer for constant data.
Definition: CountedPtr.h:86
virtual void _accumNpts(uInt64 &npts, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
scan through the data set to determine the number of good (unmasked, weight > 0, within range) points...
virtual AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
get the median of the absolute deviation about the median of the data.
std::map< uInt, Bool > _isIncludeRanges
Bool hasData() const
Has any data been added to this object? Will return False if the object has been reset and no data ha...
void _accumulate(StatsData< AccumType > &stats, const AccumType &datum, const LocationType &location)
void setDataProvider(StatsDataProvider< CASA_STATP > *dataProvider)
An exception will be thrown if setCalculateAsAdded(True) has been called.
virtual void _findBins(vector< vector< uInt64 > > &binCounts, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit) const
Get the counts of data within the specified histogram bins.
static void _convertToAbsDevMedArray(vector< AccumType > &myArray, AccumType median)
convert in place by taking the absolute value of the difference of the vector and the median ...
void _computeDataArrays(vector< vector< AccumType > > &arys, uInt64 &currentCount, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount)
void _initThreadVars(uInt &nBlocks, uInt64 &extra, uInt &nthreads, PtrHolder< DataIterator > &dataIter, PtrHolder< MaskIterator > &maskIter, PtrHolder< WeightsIterator > &weightsIter, PtrHolder< uInt64 > &offset, uInt nThreadsMax) const
#define DataRanges
Commonly used types in statistics framework.
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
virtual Bool _populateTestArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride, uInt maxElements) const
no weights, no mask, no ranges
virtual void setCalculateAsAdded(Bool c)
Should statistics be updated with calls to addData or should they only be calculated upon calls to ge...
virtual AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantiles, const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
If one needs to compute both the median and quantile values, it is better to call getMedianAndQuantil...
virtual std::map< Double, AccumType > getQuantiles(const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
Get the specified quantiles.
std::map< uInt, MaskIterator > _masks
void _createDataArrays(vector< vector< AccumType > > &arrays, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount)
static void _makeBins(typename StatisticsUtilities< AccumType >::BinDesc &bins, AccumType minData, AccumType maxData, uInt maxBins, Bool allowPad)
If allowPad is True, then pad the lower side of the lowest bin and the higher side of the highest bin...
vector< uInt >::const_iterator _dsiter
const Bool False
Definition: aipstype.h:44
virtual void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, Int64 nr, uInt dataStride)
no weights, no mask, no ranges
StatsData< AccumType > _statsData
vector< std::map< uInt64, AccumType > > _dataFromSingleBins(const vector< uInt64 > &binNpts, uInt64 maxArraySize, const vector< std::pair< AccumType, AccumType > > &binLimits, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
Bool _increment(Bool includeIDataset)
increment the relevant loop counters
virtual void reset()
reset object to initial state.
virtual StatisticsAlgorithm< CASA_STATP > * clone() const
Clone this instance.
virtual StatisticsData::ALGORITHM algorithm() const
get the algorithm that this object uses for computing stats
vector< Int64 >::const_iterator _citer
const Double c
Fundamental physical constants (SI units):
virtual AccumType getMedian(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
In the following group of methods, if the size of the composite dataset is smaller than binningThresh...
static void _mergeResults(vector< vector< uInt64 > > &bins, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const PtrHolder< vector< vector< uInt64 > > > &tBins, const PtrHolder< vector< CountedPtr< AccumType > > > &tSameVal, const PtrHolder< vector< Bool > > &tAllSame, uInt nThreadsMax)
virtual void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, Int64 nr, uInt dataStride)
has weights, but no mask, no ranges
void _incrementThreadIters(DataIterator &dataIter, MaskIterator &maskIter, WeightsIterator &weightsIter, uInt64 &offset, uInt nthreads) const
increment thread-based iterators
Bool _isNptsSmallerThan(vector< AccumType > &arrayToSort, uInt maxArraySize)
Determine by scanning the dataset if the number of good points is smaller than maxArraySize.
virtual void _updateDataProviderMaxMin(const StatsData< AccumType > &threadStats)
virtual void _populateArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
populate an unsorted array with valid data.
virtual StatsData< AccumType > _getInitialStats() const
void _computeStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 count)
Base class of statistics algorithm class hierarchy.
this file contains all the compiler specific defines
Definition: mainpage.dox:28
unsigned int uInt
Definition: aipstype.h:51
description of a regularly spaced bins with the first bin having lower limit of minLimit and having n...