casacore
StatisticsAlgorithm.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
26 
27 #ifndef SCIMATH_STATSALGORITHM_H
28 #define SCIMATH_STATSALGORITHM_H
29 
30 #include <casacore/casa/aips.h>
31 #include <casacore/casa/Exceptions/Error.h>
32 #include <casacore/casa/Utilities/CountedPtr.h>
33 #include <casacore/scimath/Mathematics/StatsDataProvider.h>
34 #include <casacore/scimath/Mathematics/StatisticsData.h>
35 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
36 
37 #include <map>
38 #include <set>
39 #include <vector>
40 
41 // because the template signature has become unwieldy
42 #define CASA_STATD template <class AccumType, class DataIterator, class MaskIterator, class WeightsIterator>
43 #define CASA_STATP AccumType, DataIterator, MaskIterator, WeightsIterator
44 
45 namespace casacore {
46 
47 // Base class of statistics algorithm class hierarchy.
48 
49 // The default implementation is such that statistics are only calculated when
50 // getStatistic() or getStatistics() is called. Until then, the iterators which
51 // point to the beginning of data sets, masks, etc. are held in memory. Thus,
52 // the caller must keep all data sets available for the statistics object until
53 // these methods are called, and of course, if the actual data values are changed
54 // between adding data and calculating statistics, the updated values are used when
55 // calculating statistics. Derived classes may override this behavior.
56 //
57 // PRECISION CONSIDERATIONS
58 // Many statistics are computed via accumulators. This can lead to precision issues,
59 // especially for large datasets. For this reason, it is highly recommended that the
60 // data type one uses as the AccumType be of higher precision, if possible, than the
61 // data type pointed to by input iterator. So for example, if one has a data set of
62 // Float values (to which the InputIterator type points to), then one should use type
63 // Double for the AccumType. In this case, the Float data values will be converted to
64 // Doubles before they are accumulated.
65 //
66 // METHODS OF PROVIDING DATA
67 // Data may be provided in one of two mutually exclusive ways. The first way is
68 // simpler, and that is to use the setData()/addData() methods. Calling setData() will
69 // clear any previous data that was added via these methods or via a data provider (see
70 // below). Calling addData() subsequently to setData() will add a data set to the set
71 // of data sets on which statistics will be calculated. In order for this to work
72 // correctly, the iterators which are passed into these methods must still be valid when
73 // statistics are calculated (although note that some derived classes allow certain
74 // statistics to be updated as data sets are added via these methods. See specific
75 // classes for details).
76 //
77 // The second way to provide data is via a data provider. This takes the form of
78 // a derived class of StatsDataProvider, in which various methods are implemented for
79 // retrieving various information about the data sets. Such an interface is necessary for
80 // data which does not easily lend itself to be provided via the setData()/addData()
81 // methods. For example, in the case of iterating through a lattice, a lattice iterator
82 // will overwrite the memory location of the previous chunk of data with the current chunk
83 // of data. Therefore, if one does not wish to load data from the entire lattice into
84 // memory (which is why LatticeIterator was designed in this way), one must the
85 // LatticeStatsDataProvider class, which the statistics framework will use to iteratate
86 // through the lattice, only keeping one chunk of the data of the lattice in memory at one
87 // time.
88 //
89 // QUANTILES
90 // A quantile is a value contained in a data set, such that, it has a zero-based
91 // index of ceil(q*n)-1 in the equivalent ordered dataset, where 0 < q < 1 specifies
92 // the fractional location within the ordered dataset and n is the total number of elements.
93 // Note that, for a dataset with an odd number of elements, the median is the same as
94 // the quantile value when q = 0.5. However, there is no such correspondance between the
95 // median in a dataset with an even number of elements, since the median in that case is
96 // given by the mean of the elements of zero-based indeces n/2-1 and n/2 in the equivalent
97 // ordered dataset. Thus, in the case of a dataset with an even number of values, the
98 // median may not even exist in the dataset, while a quantile value must exist in the
99 // dataset. Note when calculating quantile values, a dataset that does not fall in
100 // specified dataset ranges, is not included via a stride specification, is masked, or
101 // has a weight of zero is not considered a member of the dataset for the pruposes of
102 // quantile calculations.
103 
104 template <class AccumType, class DataIterator, class MaskIterator=const Bool *, class WeightsIterator=DataIterator>
106 
107 public:
108 
109  virtual ~StatisticsAlgorithm();
110 
111  // <group>
112  // Add a dataset to an existing set of datasets on which statistics are
113  // to be calculated. nr is the number of points to be considered.
114  // If <src>dataStride</src> is greater than 1, when <src>nrAccountsForStride</src>=True indicates
115  // that the stride has been taken into account in the value of <src>nr</src>. Otherwise, it has
116  // not so that the actual number of points to include is nr/dataStride if nr % dataStride == 0 or
117  // (int)(nr/dataStride) + 1 otherwise.
118  // If one calls this method after a data provider has been set, an exception will be
119  // thrown. In this case, one should call setData(), rather than addData(), to indicate
120  // that the underlying data provider should be removed.
121  // <src>dataRanges</src> provide the ranges of data to include if <src>isInclude</src> is True,
122  // or ranges of data to exclude if <src>isInclude</src> is False. If a datum equals the end point
123  // of a data range, it is considered good (included) if <src>isInclude</src> is True, and it is
124  // considered bad (excluded) if <src>isInclude</src> is False.
125 
126  virtual void addData(
127  const DataIterator& first, uInt nr, uInt dataStride=1,
128  Bool nrAccountsForStride=False
129  );
130 
131  virtual void addData(
132  const DataIterator& first, uInt nr,
133  const DataRanges& dataRanges, Bool isInclude=True, uInt dataStride=1,
134  Bool nrAccountsForStride=False
135  );
136 
137  virtual void addData(
138  const DataIterator& first, const MaskIterator& maskFirst,
139  uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False, uInt maskStride=1
140  );
141 
142  virtual void addData(
143  const DataIterator& first, const MaskIterator& maskFirst,
144  uInt nr, const DataRanges& dataRanges,
145  Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
146  uInt maskStride=1
147  );
148 
149  virtual void addData(
150  const DataIterator& first, const WeightsIterator& weightFirst,
151  uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False
152  );
153 
154  virtual void addData(
155  const DataIterator& first, const WeightsIterator& weightFirst,
156  uInt nr, const DataRanges& dataRanges,
157  Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False
158  );
159 
160  virtual void addData(
161  const DataIterator& first, const WeightsIterator& weightFirst,
162  const MaskIterator& maskFirst, uInt nr, uInt dataStride=1,
163  Bool nrAccountsForStride=False,
164  uInt maskStride=1
165  );
166 
167  virtual void addData(
168  const DataIterator& first, const WeightsIterator& weightFirst,
169  const MaskIterator& maskFirst, uInt nr, const DataRanges& dataRanges,
170  Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
171  uInt maskStride=1
172  );
173  // </group>
174 
175  // get the algorithm that this object uses for computing stats
176  virtual StatisticsData::ALGORITHM algorithm() const = 0;
177 
178  // delete any (partially) sorted array
179  void deleteSortedArray();
180 
181  virtual AccumType getMedian(
182  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
183  CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
184  Bool persistSortedArray=False, uInt64 nBins=10000
185  ) = 0;
186 
187  // The return value is the median; the quantiles are returned in the <src>quantileToValue</src> map.
188  virtual AccumType getMedianAndQuantiles(
189  std::map<Double, AccumType>& quantileToValue, const std::set<Double>& quantiles,
190  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
191  CountedPtr<AccumType> knownMax=NULL,
192  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
193  uInt64 nBins=10000
194  ) = 0;
195 
196  // get the median of the absolute deviation about the median of the data.
197  virtual AccumType getMedianAbsDevMed(
198  CountedPtr<uInt64> knownNpts=NULL,
199  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
200  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
201  uInt64 nBins=10000
202  ) = 0;
203 
204  AccumType getQuantile(
205  Double quantile, CountedPtr<uInt64> knownNpts=NULL,
206  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
207  uInt binningThreshholdSizeBytes=4096*4096,
208  Bool persistSortedArray=False, uInt64 nBins=10000
209  );
210 
211  // get a map of quantiles to values.
212  virtual std::map<Double, AccumType> getQuantiles(
213  const std::set<Double>& quantiles, CountedPtr<uInt64> npts=NULL,
215  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
216  uInt64 nBins=10000
217  ) = 0;
218 
219  // get the value of the specified statistic
220  virtual AccumType getStatistic(StatisticsData::STATS stat);
221 
222  // certain statistics such as max and min have locations in the dataset
223  // associated with them. This method gets those locations. The first value
224  // in the returned pair is the zero-based dataset number that was set or
225  // added. The second value is the zero-based index in that dataset. A data stride
226  // of greater than one is not accounted for, so the index represents the actual
227  // location in the data set, independent of the dataStride value.
228  virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat) = 0;
229 
231 
232  // <group>
233  // setdata() clears any current datasets or data provider and then adds the specified data set as
234  // the first dataset in the (possibly new) set of data sets for which statistics are
235  // to be calculated. See addData() for parameter meanings.
236  virtual void setData(const DataIterator& first, uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False);
237 
238  virtual void setData(
239  const DataIterator& first, uInt nr,
240  const DataRanges& dataRanges, Bool isInclude=True, uInt dataStride=1,
241  Bool nrAccountsForStride=False
242  );
243 
244  virtual void setData(
245  const DataIterator& first, const MaskIterator& maskFirst,
246  uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False,
247  uInt maskStride=1
248  );
249 
250  virtual void setData(
251  const DataIterator& first, const MaskIterator& maskFirst,
252  uInt nr, const DataRanges& dataRanges,
253  Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
254  uInt maskStride=1
255  );
256 
257  virtual void setData(
258  const DataIterator& first, const WeightsIterator& weightFirst,
259  uInt nr, uInt dataStride=1,
260  Bool nrAccountsForStride=False
261  );
262 
263  virtual void setData(
264  const DataIterator& first, const WeightsIterator& weightFirst,
265  uInt nr, const DataRanges& dataRanges,
266  Bool isInclude=True, uInt dataStride=1,
267  Bool nrAccountsForStride=False
268  );
269 
270  virtual void setData(
271  const DataIterator& first, const WeightsIterator& weightFirst,
272  const MaskIterator& maskFirst, uInt nr, uInt dataStride=1,
273  Bool nrAccountsForStride=False,
274  uInt maskStride=1
275  );
276 
277  virtual void setData(
278  const DataIterator& first, const WeightsIterator& weightFirst,
279  const MaskIterator& maskFirst, uInt nr, const DataRanges& dataRanges,
280  Bool isInclude=True, uInt dataStride=1, Bool nrAccountsForStride=False,
281  uInt maskStride=1
282  );
283  // </group>
284 
285  // instead of settng and adding data "by hand", set the data provider that will provide
286  // all the data sets. Calling this method will clear any other data sets that have
287  // previously been set or added.
288  virtual void setDataProvider(StatsDataProvider<CASA_STATP> *dataProvider) {
289  ThrowIf(! dataProvider, "Logic Error: data provider cannot be NULL");
290  _clearData();
291  _dataProvider = dataProvider;
292  }
293 
294  // Provide guidance to algorithms by specifying a priori which statistics the
295  // caller would like calculated.
296  virtual void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
297 
298 protected:
300 
301  // use copy semantics
304  );
305 
306  // Allows derived classes to do things after data is set or added.
307  // Default implementation does nothing.
308  virtual void _addData() {}
309 
310  virtual void _clearData();
311 
312  const vector<Int64>& _getCounts() const { return _counts; }
313 
314  const vector<DataIterator>& _getData() const { return _data; }
315 
317  return _dataProvider;
318  }
319 
320  const vector<uInt>& _getDataStrides() const { return _dataStrides; }
321 
322  const std::map<uInt, Bool>& _getIsIncludeRanges() const { return _isIncludeRanges; }
323 
324  const std::map<uInt, MaskIterator> _getMasks() const { return _masks; }
325 
326  const std::map<uInt, uInt>& _getMaskStrides() const { return _maskStrides; }
327 
328  const std::map<uInt, DataRanges>& _getRanges() const { return _dataRanges; }
329 
330  virtual AccumType _getStatistic(StatisticsData::STATS stat) = 0;
331 
332  virtual StatsData<AccumType> _getStatistics() = 0;
333 
334  const std::set<StatisticsData::STATS> _getStatsToCalculate() const {
335  return _statsToCalculate;
336  }
337 
338  std::vector<AccumType>& _getSortedArray() { return _sortedArray; }
339 
340  virtual const std::set<StatisticsData::STATS>& _getUnsupportedStatistics() const {
341  return _unsupportedStats;
342  }
343 
344  const std::map<uInt, WeightsIterator>& _getWeights() const {
345  return _weights;
346  }
347 
348  /*
349  // get the zero-based indices of the specified quantiles in sorted dataset with npts
350  // number of good points. The returned map maps quantiles to indices.
351  static std::map<Double, uInt64> _indicesFromQuantiles(
352  uInt64 npts, const std::set<Double>& quantiles
353  );
354  */
355 
356  // The array can be changed by paritally sorting it up to the largest index. Return
357  // a map of index to value in the sorted array.
358  static std::map<uInt64, AccumType> _valuesFromArray(
359  vector<AccumType>& myArray, const std::set<uInt64>& indices
360  );
361 
362  void _setSortedArray(const vector<AccumType>& v) { _sortedArray = v; }
363 
364 private:
365  vector<DataIterator> _data;
366  // maps data to weights
367  std::map<uInt, WeightsIterator> _weights;
368  // maps data to masks
369  std::map<uInt, MaskIterator> _masks;
370  vector<Int64> _counts;
371  vector<uInt> _dataStrides;
372  std::map<uInt, uInt> _maskStrides;
373  std::map<uInt, Bool> _isIncludeRanges;
374  std::map<uInt, DataRanges> _dataRanges;
375  vector<AccumType> _sortedArray;
376  std::set<StatisticsData::STATS> _statsToCalculate, _unsupportedStats;
378 
379  void _throwIfDataProviderDefined() const;
380 };
381 
382 }
383 
384 #ifndef CASACORE_NO_AUTO_TEMPLATES
385 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.tcc>
386 #endif
387 
388 #endif
virtual void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Provide guidance to algorithms by specifying a priori which statistics the caller would like calculat...
std::vector< AccumType > & _getSortedArray()
StatisticsAlgorithm< CASA_STATP > & operator=(const StatisticsAlgorithm< CASA_STATP > &other)
use copy semantics
virtual void _addData()
Allows derived classes to do things after data is set or added.
const std::map< uInt, uInt > & _getMaskStrides() const
std::map< uInt, DataRanges > _dataRanges
virtual void addData(const DataIterator &first, uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False)
Add a dataset to an existing set of datasets on which statistics are to be calculated.
std::map< uInt, WeightsIterator > _weights
maps data to weights
virtual StatsData< AccumType > _getStatistics()=0
struct Node * first
Definition: malloc.h:330
LatticeExprNode max(const LatticeExprNode &left, const LatticeExprNode &right)
std::map< uInt, MaskIterator > _masks
maps data to masks
unsigned long long uInt64
Definition: aipsxtype.h:39
virtual AccumType getStatistic(StatisticsData::STATS stat)
get the value of the specified statistic
virtual AccumType _getStatistic(StatisticsData::STATS stat)=0
virtual AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantileToValue, const std::set< Double > &quantiles, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)=0
The return value is the median; the quantiles are returned in the quantileToValue map...
virtual AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)=0
get the median of the absolute deviation about the median of the data.
void _setSortedArray(const vector< AccumType > &v)
std::map< uInt, Bool > _isIncludeRanges
ALGORITHM
implemented algorithms
const std::map< uInt, MaskIterator > _getMasks() const
Referenced counted pointer for constant data.
Definition: CountedPtr.h:86
LatticeExprNode min(const LatticeExprNode &left, const LatticeExprNode &right)
virtual void setDataProvider(StatsDataProvider< CASA_STATP > *dataProvider)
instead of settng and adding data "by hand", set the data provider that will provide all the data set...
StatsDataProvider< CASA_STATP > * _getDataProvider()
double Double
Definition: aipstype.h:55
virtual StatsData< AccumType > getStatistics()
std::map< uInt, uInt > _maskStrides
StatsDataProvider< CASA_STATP > * _dataProvider
const vector< DataIterator > & _getData() const
virtual StatisticsData::ALGORITHM algorithm() const =0
get the algorithm that this object uses for computing stats
#define ThrowIf(c, m)
Throw an AipsError exception if the condition is true.
Definition: Error.h:90
#define DataRanges
Commonly used types in statistics framework.
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
virtual const std::set< StatisticsData::STATS > & _getUnsupportedStatistics() const
void _throwIfDataProviderDefined() const
const std::map< uInt, DataRanges > & _getRanges() const
const std::set< StatisticsData::STATS > _getStatsToCalculate() const
const Bool False
Definition: aipstype.h:44
const std::map< uInt, WeightsIterator > & _getWeights() const
virtual AccumType getMedian(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)=0
const vector< uInt > & _getDataStrides() const
virtual std::pair< Int64, Int64 > getStatisticIndex(StatisticsData::STATS stat)=0
certain statistics such as max and min have locations in the dataset associated with them...
void deleteSortedArray()
delete any (partially) sorted array
virtual void setData(const DataIterator &first, uInt nr, uInt dataStride=1, Bool nrAccountsForStride=False)
setdata() clears any current datasets or data provider and then adds the specified data set as the fi...
std::set< StatisticsData::STATS > _statsToCalculate
std::set< StatisticsData::STATS > _unsupportedStats
*static std::map< uInt64, AccumType > _valuesFromArray(vector< AccumType > &myArray, const std::set< uInt64 > &indices)
The array can be changed by paritally sorting it up to the largest index.
AccumType getQuantile(Double quantile, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
const std::map< uInt, Bool > & _getIsIncludeRanges() const
const vector< Int64 > & _getCounts() const
Base class of statistics algorithm class hierarchy.
const Bool True
Definition: aipstype.h:43
this file contains all the compiler specific defines
Definition: mainpage.dox:28
virtual std::map< Double, AccumType > getQuantiles(const std::set< Double > &quantiles, CountedPtr< uInt64 > npts=NULL, CountedPtr< AccumType > min=NULL, CountedPtr< AccumType > max=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)=0
get a map of quantiles to values.
unsigned int uInt
Definition: aipstype.h:51