Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
StatisticFunctions.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2015.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Clemens Groepl $
32 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer$
33 // --------------------------------------------------------------------------
34 #ifndef OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
35 #define OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
36 
37 #include <vector>
39 #include <OpenMS/CONCEPT/Types.h>
40 
41 #include <boost/accumulators/accumulators.hpp>
42 #include <boost/accumulators/statistics/covariance.hpp>
43 #include <boost/accumulators/statistics/mean.hpp>
44 #include <boost/accumulators/statistics/stats.hpp>
45 #include <boost/accumulators/statistics/variance.hpp>
46 #include <boost/accumulators/statistics/variates/covariate.hpp>
47 #include <boost/function/function_base.hpp>
48 #include <boost/lambda/casts.hpp>
49 #include <boost/lambda/lambda.hpp>
50 
51 #include <iterator>
52 #include <algorithm>
53 
54 
55 
56 using std::iterator_traits;
57 
58 namespace OpenMS
59 {
60 
61  namespace Math
62  {
70  template <typename IteratorType>
71  static void checkIteratorsNotNULL(
72  IteratorType begin, IteratorType end)
73  {
74  if (begin == end)
75  {
76  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
77  }
78  }
86  template <typename IteratorType>
87  static void checkIteratorsEqual(
88  IteratorType begin, IteratorType end)
89  {
90  if (begin != end)
91  {
92  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
93  }
94  }
102  template <typename IteratorType1, typename IteratorType2>
104  IteratorType1 begin_b, IteratorType1 end_b,
105  IteratorType2 begin_a, IteratorType2 end_a)
106  {
107  if(begin_b != end_b && begin_a == end_a)
108  {
109  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
110  }
111  }
117  template <typename IteratorType>
118  static double sum(IteratorType begin, IteratorType end)
119  {
120  return std::accumulate(begin, end, 0.0);
121  }
122 
130  template <typename IteratorType>
131  static double mean(IteratorType begin, IteratorType end)
132  {
133  checkIteratorsNotNULL(begin, end);
134  return sum(begin, end) / std::distance(begin, end);
135  }
136 
148  template <typename IteratorType>
149  static double median(IteratorType begin, IteratorType end, bool sorted = false)
150  {
151  checkIteratorsNotNULL(begin, end);
152  if (!sorted)
153  {
154  std::sort(begin, end);
155  }
156 
157  Size size = std::distance(begin, end);
158  if (size % 2 == 0) // even size => average two middle values
159  {
160  IteratorType it1 = begin;
161  std::advance(it1, size / 2 - 1);
162  IteratorType it2 = it1;
163  std::advance(it2, 1);
164  return (*it1 + *it2) / 2.0;
165  }
166  else
167  {
168  IteratorType it = begin;
169  std::advance(it, (size - 1) / 2);
170  return *it;
171  }
172  }
173 
187  template <typename IteratorType>
188  static double quantile1st(
189  IteratorType begin, IteratorType end, bool sorted = false)
190  {
191  checkIteratorsNotNULL(begin, end);
192 
193  if (!sorted)
194  {
195  std::sort(begin, end);
196  }
197 
198  Size size = std::distance(begin, end);
199  if (size % 2 == 0)
200  {
201  return median(begin, begin + (size/2)-1, true); //-1 to exclude median values
202  }
203  return median(begin, begin + (size/2), true);
204  }
205 
219  template <typename IteratorType>
220  static double quantile3rd(
221  IteratorType begin, IteratorType end, bool sorted = false)
222  {
223  checkIteratorsNotNULL(begin, end);
224  if (!sorted)
225  {
226  std::sort(begin, end);
227  }
228 
229  Size size = std::distance(begin, end);
230  return median(begin + (size/2)+1, end, true); //+1 to exclude median values
231  }
232 
240  template <typename IteratorType>
241  static double variance(
242  IteratorType begin, IteratorType end,
243  double mean = std::numeric_limits<double>::max())
244  {
245  checkIteratorsNotNULL(begin, end);
246  double sum = 0.0;
247  if (mean == std::numeric_limits<double>::max())
248  {
249  mean = Math::mean(begin, end);
250  }
251  for (IteratorType iter=begin; iter!=end; ++iter)
252  {
253  double diff = *iter - mean;
254  sum += diff * diff;
255  }
256  return sum / (std::distance(begin, end)-1);
257  }
258 
266  template <typename IteratorType>
267  static double sd(
268  IteratorType begin, IteratorType end,
269  double mean = std::numeric_limits<double>::max())
270  {
271  checkIteratorsNotNULL(begin, end);
272  return std::sqrt( variance(begin, end, mean) );
273  }
274 
282  template <typename IteratorType>
283  static double absdev(
284  IteratorType begin, IteratorType end,
285  double mean = std::numeric_limits<double>::max())
286  {
287  checkIteratorsNotNULL(begin, end);
288  double sum = 0.0;
289  if (mean == std::numeric_limits<double>::max())
290  {
291  mean = Math::mean(begin, end);
292  }
293  for (IteratorType iter=begin; iter!=end; ++iter)
294  {
295  sum += *iter - mean;
296  }
297  return sum / std::distance(begin, end);
298  }
299 
309  template <typename IteratorType1, typename IteratorType2>
310  static double covariance(
311  IteratorType1 begin_a, IteratorType1 end_a,
312  IteratorType2 begin_b, IteratorType2 end_b)
313  {
314  //no data or different lengths
315  checkIteratorsNotNULL(begin_a, end_a);
316 
317  double sum = 0.0;
318  double mean_a = Math::mean(begin_a, end_a);
319  double mean_b = Math::mean(begin_b, end_b);
320  IteratorType1 iter_a = begin_a;
321  IteratorType2 iter_b = begin_b;
322  for (; iter_a != end_a; ++iter_a, ++iter_b)
323  {
324  /* assure both ranges have the same number of elements */
325  checkIteratorsAreValid(begin_b, end_b, begin_a, end_a);
326  sum += (*iter_a - mean_a) * (*iter_b - mean_b);
327  }
328  /* assure both ranges have the same number of elements */
329  checkIteratorsEqual(iter_b, end_b);
330  Size n = std::distance(begin_a, end_a);
331  return sum / (n-1);
332  }
333 
334 
335 
336 
346  template <typename IteratorType1, typename IteratorType2>
347  static double meanSquareError(
348  IteratorType1 begin_a, IteratorType1 end_a,
349  IteratorType2 begin_b, IteratorType2 end_b)
350  {
351  //no data or different lengths
352  checkIteratorsNotNULL(begin_a, end_a);
353 
354  SignedSize dist = std::distance(begin_a, end_a);
355  double error = 0;
356  IteratorType1 iter_a = begin_a;
357  IteratorType2 iter_b = begin_b;
358  for (; iter_a != end_a; ++iter_a, ++iter_b)
359  {
360  /* assure both ranges have the same number of elements */
361  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
362 
363  double tmp(*iter_a - *iter_b);
364  error += tmp * tmp;
365  }
366  /* assure both ranges have the same number of elements */
367  checkIteratorsEqual(iter_b, end_b);
368 
369  return error / dist;
370  }
371 
381  template <typename IteratorType1, typename IteratorType2>
382  static double classificationRate(
383  IteratorType1 begin_a, IteratorType1 end_a,
384  IteratorType2 begin_b, IteratorType2 end_b)
385  {
386  //no data or different lengths
387  checkIteratorsNotNULL(begin_a, end_a);
388 
389  SignedSize dist = std::distance(begin_a, end_a);
390  SignedSize correct = dist;
391  IteratorType1 iter_a = begin_a;
392  IteratorType2 iter_b = begin_b;
393  for (; iter_a != end_a; ++iter_a, ++iter_b)
394  {
395  /* assure both ranges have the same number of elements */
396  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
397  if ((*iter_a < 0 && *iter_b >= 0) || (*iter_a >= 0 && *iter_b < 0))
398  {
399  --correct;
400  }
401 
402  }
403  /* assure both ranges have the same number of elements */
404  checkIteratorsEqual(iter_b, end_b);
405 
406  return double(correct) / dist;
407  }
408 
421  template <typename IteratorType1, typename IteratorType2>
423  IteratorType1 begin_a, IteratorType1 end_a,
424  IteratorType2 begin_b, IteratorType2 end_b)
425  {
426  //no data or different lengths
427  checkIteratorsNotNULL(begin_a, end_b);
428 
429  double tp = 0;
430  double fp = 0;
431  double tn = 0;
432  double fn = 0;
433  IteratorType1 iter_a = begin_a;
434  IteratorType2 iter_b = begin_b;
435  for (; iter_a != end_a; ++iter_a, ++iter_b)
436  {
437  /* assure both ranges have the same number of elements */
438  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
439 
440  if (*iter_a < 0 && *iter_b >= 0)
441  {
442  ++fn;
443  }
444  else if (*iter_a < 0 && *iter_b < 0)
445  {
446  ++tn;
447  }
448  else if (*iter_a >= 0 && *iter_b >= 0)
449  {
450  ++tp;
451  }
452  else if (*iter_a >= 0 && *iter_b < 0)
453  {
454  ++fp;
455  }
456  }
457  /* assure both ranges have the same number of elements */
458  checkIteratorsEqual(iter_b, end_b);
459 
460  return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
461  }
462 
474  template <typename IteratorType1, typename IteratorType2>
476  IteratorType1 begin_a, IteratorType1 end_a,
477  IteratorType2 begin_b, IteratorType2 end_b)
478  {
479  //no data or different lengths
480  checkIteratorsNotNULL(begin_a, end_a);
481 
482  //calculate average
483  SignedSize dist = std::distance(begin_a, end_a);
484  double avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
485  double avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
486 
487  double numerator = 0;
488  double denominator_a = 0;
489  double denominator_b = 0;
490  IteratorType1 iter_a = begin_a;
491  IteratorType2 iter_b = begin_b;
492  for (; iter_a != end_a; ++iter_a, ++iter_b)
493  {
494  /* assure both ranges have the same number of elements */
495  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
496  double temp_a = *iter_a - avg_a;
497  double temp_b = *iter_b - avg_b;
498  numerator += (temp_a * temp_b);
499  denominator_a += (temp_a * temp_a);
500  denominator_b += (temp_b * temp_b);
501  }
502  /* assure both ranges have the same number of elements */
503  checkIteratorsEqual(iter_b, end_b);
504  return numerator / sqrt(denominator_a * denominator_b);
505  }
506 
508  template <typename Value>
509  static void computeRank(std::vector<Value> & w)
510  {
511  Size i = 0; // main index
512  Size z = 0; // "secondary" index
513  Value rank = 0;
514  Size n = (w.size() - 1);
515  //store original indices for later
516  std::vector<std::pair<Size, Value> > w_idx;
517  for (Size j = 0; j < w.size(); ++j)
518  {
519  w_idx.push_back(std::make_pair(j, w[j]));
520  }
521  //sort
522  std::sort(w_idx.begin(), w_idx.end(),
523  boost::lambda::ret<bool>((&boost::lambda::_1->*& std::pair<Size, Value>::second) <
524  (&boost::lambda::_2->*& std::pair<Size, Value>::second)));
525  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
526  while (i < n)
527  {
528  // test for equality with tolerance:
529  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
530  {
531  w_idx[i].second = Value(i + 1);
532  ++i;
533  }
534  else // tie, replace by mean rank
535  {
536  // count number of ties
537  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
538  {
539  }
540  // compute mean rank of tie
541  rank = 0.5 * (i + z + 1);
542  // replace intensities by rank
543  for (Size v = i; v <= z - 1; ++v)
544  {
545  w_idx[v].second = rank;
546  }
547  i = z;
548  }
549  }
550  if (i == n)
551  w_idx[n].second = Value(n + 1);
552  //restore original order and replace elements of w with their ranks
553  for (Size j = 0; j < w.size(); ++j)
554  {
555  w[w_idx[j].first] = w_idx[j].second;
556  }
557  }
558 
570  template <typename IteratorType1, typename IteratorType2>
572  IteratorType1 begin_a, IteratorType1 end_a,
573  IteratorType2 begin_b, IteratorType2 end_b)
574  {
575  //no data or different lengths
576  checkIteratorsNotNULL(begin_a, end_a);
577 
578  // store and sort intensities of model and data
579  SignedSize dist = std::distance(begin_a, end_a);
580  std::vector<double> ranks_data;
581  ranks_data.reserve(dist);
582  std::vector<double> ranks_model;
583  ranks_model.reserve(dist);
584  IteratorType1 iter_a = begin_a;
585  IteratorType2 iter_b = begin_b;
586  for (; iter_a != end_a; ++iter_a, ++iter_b)
587  {
588  /* assure both ranges have the same number of elements */
589  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
590 
591  ranks_model.push_back(*iter_a);
592  ranks_data.push_back(*iter_b);
593  }
594  /* assure both ranges have the same number of elements */
595  checkIteratorsEqual(iter_b, end_b);
596 
597  // replace entries by their ranks
598  computeRank(ranks_data);
599  computeRank(ranks_model);
600 
601  double mu = double(ranks_data.size() + 1) / 2.; // mean of ranks
602  // Was the following, but I think the above is more correct ... (Clemens)
603  // double mu = (ranks_data.size() + 1) / 2;
604 
605  double sum_model_data = 0;
606  double sqsum_data = 0;
607  double sqsum_model = 0;
608 
609  for (Int i = 0; i < dist; ++i)
610  {
611  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
612  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
613  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
614  }
615 
616  // check for division by zero
617  if (!sqsum_data || !sqsum_model)
618  {
619  return 0;
620  }
621 
622  return sum_model_data / (sqrt(sqsum_data) * sqrt(sqsum_model));
623  }
624 
625  } // namespace Math
626 } // namespace OpenMS
627 
628 #endif // OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b) ...
Definition: StatisticFunctions.h:347
static double variance(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the variance of a range of values.
Definition: StatisticFunctions.h:241
static double sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:118
static void checkIteratorsAreValid(IteratorType1 begin_b, IteratorType1 end_b, IteratorType2 begin_a, IteratorType2 end_a)
Helper function checking if an iterator and a co-iterator both have a next element.
Definition: StatisticFunctions.h:103
static double quantile1st(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the first quantile of a range of values.
Definition: StatisticFunctions.h:188
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:509
static double covariance(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the covariance of two ranges of values.
Definition: StatisticFunctions.h:310
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:128
static void checkIteratorsEqual(IteratorType begin, IteratorType end)
Helper function checking if two iterators are equal.
Definition: StatisticFunctions.h:87
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:131
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:267
static double matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:422
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:283
static double pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:475
static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
Helper function checking if two iterators are not equal.
Definition: StatisticFunctions.h:71
static double quantile3rd(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the third quantile of a range of values.
Definition: StatisticFunctions.h:220
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:149
Invalid range exception.
Definition: Exception.h:286
static double rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:571
static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:382
int Int
Signed integer type.
Definition: Types.h:96

OpenMS / TOPP release 2.0.0 Documentation generated on Tue Nov 1 2016 16:34:46 using doxygen 1.8.11