casacore
ClassicalStatistics.h
Go to the documentation of this file.
1 //# Copyright (C) 2000,2001
2 //# Associated Universities, Inc. Washington DC, USA.
3 //#
4 //# This library is free software; you can redistribute it and/or modify it
5 //# under the terms of the GNU Library General Public License as published by
6 //# the Free Software Foundation; either version 2 of the License, or (at your
7 //# option) any later version.
8 //#
9 //# This library is distributed in the hope that it will be useful, but WITHOUT
10 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
12 //# License for more details.
13 //#
14 //# You should have received a copy of the GNU Library General Public License
15 //# along with this library; if not, write to the Free Software Foundation,
16 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
17 //#
18 //# Correspondence concerning AIPS++ should be addressed as follows:
19 //# Internet email: aips2-request@nrao.edu.
20 //# Postal address: AIPS++ Project Office
21 //# National Radio Astronomy Observatory
22 //# 520 Edgemont Road
23 //# Charlottesville, VA 22903-2475 USA
24 //#
25 //# $Id: Array.h 21545 2015-01-22 19:36:35Z gervandiepen $
26 
27 #ifndef SCIMATH_CLASSICALSTATS_H
28 #define SCIMATH_CLASSICALSTATS_H
29 
30 #include <casacore/casa/aips.h>
31 
32 #include <casacore/scimath/Mathematics/StatisticsAlgorithm.h>
33 
34 #include <casacore/scimath/Mathematics/StatisticsTypes.h>
35 #include <casacore/scimath/Mathematics/StatisticsUtilities.h>
36 
37 #include <set>
38 #include <vector>
39 #include <utility>
40 
41 namespace casacore {
42 
43 template <class T> class PtrHolder;
44 
45 // Class to calculate statistics in a "classical" sense, ie using accumulators with no
46 // special filtering beyond optional range filtering etc.
47 //
48 // setCalculateAsAdded() allows one to specify if statistics should be calculated and updated
49 // on upon each call to set/addData(). If False, statistics will be calculated only when
50 // getStatistic(), getStatistics(), or similar methods are called. Setting this value to True
51 // allows the caller to not have to keep all the data accessible at once. Note however, that all
52 // data must be simultaneously accessible if quantile (eg median) calculations are desired.
53 
54 // I attempted to write this class using the Composite design pattern, with eg the
55 // _unweightedStats() and _weightedStats() methods in their own class, but for reasons I
56 // don't understand, that impacted performance significantly. So I'm using the current
57 // architecture, which I know is a bit a maintenance nightmare.
58 
59 template <class AccumType, class DataIterator, class MaskIterator=const Bool*, class WeightsIterator=DataIterator>
61  : public StatisticsAlgorithm<AccumType, DataIterator, MaskIterator, WeightsIterator> {
62 public:
63 
65 
66  // copy semantics
68 
69  virtual ~ClassicalStatistics();
70 
71  // copy semantics
74  );
75 
76  // get the algorithm that this object uses for computing stats
79  };
80 
81  // <group>
82  // In the following group of methods, if the size of the composite dataset
83  // is smaller than
84  // <src>binningThreshholdSizeBytes</src>, the composite dataset
85  // will be (perhaps partially) sorted and persisted in memory during the
86  // call. In that case, and if <src>persistSortedArray</src> is True, this
87  // sorted array will remain in memory after the call and will be used on
88  // subsequent calls of this method when <src>binningThreshholdSizeBytes</src>
89  // is greater than the size of the composite dataset. If
90  // <src>persistSortedArray</src> is False, the sorted array will not be
91  // stored after this call completes and so any subsequent calls for which the
92  // dataset size is less than <src>binningThreshholdSizeBytes</src>, the
93  // dataset will be sorted from scratch. Values which are not included due to
94  // non-unity strides, are not included in any specified ranges, are masked,
95  // or have associated weights of zero are not considered as dataset members
96  // for quantile computations.
97  // If one has a priori information regarding the number of points (npts) and/or
98  // the minimum and maximum values of the data set, these can be supplied to
99  // improve performance. Note however, that if these values are not correct, the
100  // resulting median
101  // and/or quantile values will also not be correct (although see the following notes regarding
102  // max/min). Note that if this object has already had getStatistics()
103  // called, and the min and max were calculated, there is no need to pass these values in
104  // as they have been stored internally and used (although passing them in shouldn't hurt
105  // anything). If provided, npts, the number of points falling in the specified ranges which are
106  // not masked and have weights > 0, should be exactly correct. <src>min</src> can be less than
107  // the true minimum, and <src>max</src> can be greater than the True maximum, but for best
108  // performance, these should be as close to the actual min and max as possible.
109  // In order for quantile computations to occur over multiple datasets, all datasets
110  // must be available. This means that if setCalculateAsAdded()
111  // was previously called by passing in a value of True, these methods will throw
112  // an exception as the previous call indicates that there is no guarantee that
113  // all datasets will be available. If one uses a data provider (by having called
114  // setDataProvider()), then this should not be an issue.
115 
116  // get the median of the distribution.
117  // For a dataset with an odd number of good points, the median is just the value
118  // at index int(N/2) in the equivalent sorted dataset, where N is the number of points.
119  // For a dataset with an even number of points, the median is the mean of the values at
120  // indices int(N/2)-1 and int(N/2) in the sorted dataset.
121  // <src>nBins</src> is the number of bins, per histogram, to use to bin the data. More
122  // bins decrease the likelihood that multiple passes of the data set will be necessary, but
123  // also increase the amount of memory used. If nBins is set to less than 1,000, it is
124  // automatically increased to 1,000; there should be no reason to ever set nBins to be
125  // this small.
126  virtual AccumType getMedian(
127  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
128  CountedPtr<AccumType> knownMax=NULL, uInt binningThreshholdSizeBytes=4096*4096,
129  Bool persistSortedArray=False, uInt64 nBins=10000
130  );
131 
132  // If one needs to compute both the median and quantile values, it is better to call
133  // getMedianAndQuantiles() rather than getMedian() and getQuantiles() separately, as the
134  // first will scan large data sets fewer times than calling the separate methods.
135  // The return value is the median; the quantiles are returned in the <src>quantiles</src> map.
136  // Values in the <src>fractions</src> set represent the locations in the CDF and should be
137  // between 0 and 1, exclusive.
138  virtual AccumType getMedianAndQuantiles(
139  std::map<Double, AccumType>& quantiles, const std::set<Double>& fractions,
140  CountedPtr<uInt64> knownNpts=NULL, CountedPtr<AccumType> knownMin=NULL,
141  CountedPtr<AccumType> knownMax=NULL,
142  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
143  uInt64 nBins=10000
144  );
145 
146  // get the median of the absolute deviation about the median of the data.
147  virtual AccumType getMedianAbsDevMed(
148  CountedPtr<uInt64> knownNpts=NULL,
149  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
150  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
151  uInt64 nBins=10000
152  );
153 
154  // Get the specified quantiles. <src>fractions</src> must be between 0 and 1,
155  // noninclusive.
156  virtual std::map<Double, AccumType> getQuantiles(
157  const std::set<Double>& fractions, CountedPtr<uInt64> knownNpts=NULL,
158  CountedPtr<AccumType> knownMin=NULL, CountedPtr<AccumType> knownMax=NULL,
159  uInt binningThreshholdSizeBytes=4096*4096, Bool persistSortedArray=False,
160  uInt64 nBins=10000
161  );
162 
163  // </group>
164 
165  // scan the dataset(s) that have been added, and find the min and max.
166  // This method may be called even if setStatsToCaclulate has been called and
167  // MAX and MIN has been excluded. If setCalculateAsAdded(True) has previously been
168  // called after this object has been (re)initialized, an exception will be thrown.
169  virtual void getMinMax(AccumType& mymin, AccumType& mymax);
170 
171  // scan the dataset(s) that have been added, and find the number of good points.
172  // This method may be called even if setStatsToCaclulate has been called and
173  // NPTS has been excluded. If setCalculateAsAdded(True) has previously been
174  // called after this object has been (re)initialized, an exception will be thrown.
175  virtual uInt64 getNPts();
176 
177  // see base class description
178  virtual std::pair<Int64, Int64> getStatisticIndex(StatisticsData::STATS stat);
179 
180  // Has any data been added to this object? Will return False if the object has
181  // been reset and no data have been added afterward.
182  Bool hasData() const { return _hasData; }
183 
184  // reset object to initial state. Clears all private fields including data,
185  // accumulators, etc.
186  virtual void reset();
187 
188  // Should statistics be updated with calls to addData or should they only be calculated
189  // upon calls to getStatistics etc? Beware that calling this will automatically reinitialize
190  // the object, so that it will contain no references to data et al. after this method has
191  // been called.
192  virtual void setCalculateAsAdded(Bool c);
193 
194  // An exception will be thrown if setCalculateAsAdded(True) has been called.
196 
197  void setStatsToCalculate(std::set<StatisticsData::STATS>& stats);
198 
199 protected:
200 
201  // <group>
202  // scan through the data set to determine the number of good (unmasked, weight > 0,
203  // within range) points. The first with no mask, no
204  // ranges, and no weights is trivial with npts = nr in this class, but is implemented here
205  // so that derived classes may override it.
206  inline virtual void _accumNpts(
207  uInt64& npts,
208  const DataIterator& dataBegin, Int64 nr, uInt dataStride
209  ) const;
210 
211  virtual void _accumNpts(
212  uInt64& npts,
213  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
214  const DataRanges& ranges, Bool isInclude
215  ) const;
216 
217  virtual void _accumNpts(
218  uInt64& npts,
219  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
220  const MaskIterator& maskBegin, uInt maskStride
221  ) const;
222 
223  virtual void _accumNpts(
224  uInt64& npts,
225  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
226  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
227  Bool isInclude
228  ) const;
229 
230  virtual void _accumNpts(
231  uInt64& npts,
232  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
233  Int64 nr, uInt dataStride
234  ) const;
235 
236  virtual void _accumNpts(
237  uInt64& npts,
238  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
239  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
240  ) const;
241 
242  virtual void _accumNpts(
243  uInt64& npts,
244  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
245  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
246  const DataRanges& ranges, Bool isInclude
247  ) const;
248 
249  virtual void _accumNpts(
250  uInt64& npts,
251  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
252  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
253  ) const;
254  // </group>
255 
256  // <group>
257  inline void _accumulate(
258  StatsData<AccumType>& stats, const AccumType& datum,
259  const LocationType& location
260  );
261 
262  inline void _accumulate(
263  StatsData<AccumType>& stats, const AccumType& datum,
264  const AccumType& weight, const LocationType& location
265  );
266  // </group>
267 
268  void _addData();
269 
270  void _clearData();
271 
272  void _clearStats();
273 
274  // scan dataset(s) to find min and max
275  void _doMinMax(AccumType& vmin, AccumType& vmax);
276 
277  // <group>
278  // Get the counts of data within the specified histogram bins. The number of
279  // arrays within binCounts will be equal to the number of histograms in <src>binDesc</src>.
280  // Each array within <src>binCounts</src> will have the same number of elements as the
281  // number of bins in its corresponding histogram in <src>binDesc</src>.
282  virtual void _findBins(
283  vector<vector<uInt64> >& binCounts,
284  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
285  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
286  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
287  const vector<AccumType>& maxLimit
288  ) const;
289 
290  virtual void _findBins(
291  vector<vector<uInt64> >& binCounts,
292  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
293  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
294  const DataRanges& ranges, Bool isInclude,
295  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
296  ) const;
297 
298  virtual void _findBins(
299  vector<vector<uInt64> >& binCounts,
300  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
301  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
302  const MaskIterator& maskBegin, uInt maskStride,
303  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
304  ) const;
305 
306  virtual void _findBins(
307  vector<vector<uInt64> >& binCounts,
308  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
309  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
310  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
311  Bool isInclude,
312  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
313  ) const;
314 
315  virtual void _findBins(
316  vector<vector<uInt64> >& binCounts,
317  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
318  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
319  Int64 nr, uInt dataStride,
320  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
321  ) const ;
322 
323  virtual void _findBins(
324  vector<vector<uInt64> >& binCounts,
325  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
326  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
327  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude,
328  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
329  ) const;
330 
331  virtual void _findBins(
332  vector<vector<uInt64> >& binCounts,
333  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
334  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
335  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
336  const DataRanges& ranges, Bool isInclude,
337  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
338  ) const;
339 
340  virtual void _findBins(
341  vector<vector<uInt64> >& binCounts,
342  vector<CountedPtr<AccumType> >& sameVal, vector<Bool>& allSame,
343  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
344  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
345  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, const vector<AccumType>& maxLimit
346  ) const;
347  // </group>
348 
349  Bool _getDoMaxMin() const { return _doMaxMin; }
350 
351  Int64 _getIDataset() const { return _idataset; }
352 
353  virtual StatsData<AccumType> _getInitialStats() const;
354 
355  AccumType _getStatistic(StatisticsData::STATS stat);
356 
358 
359  // retreive stats structure. Allows derived classes to maintain their own
360  // StatsData structs.
361  inline virtual StatsData<AccumType>& _getStatsData() { return _statsData; }
362 
363  inline virtual const StatsData<AccumType>& _getStatsData() const { return _statsData; }
364 
365  // <group>
366  virtual void _minMax(
368  const DataIterator& dataBegin, Int64 nr, uInt dataStride
369  ) const;
370 
371  virtual void _minMax(
373  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
374  const DataRanges& ranges, Bool isInclude
375  ) const;
376 
377  virtual void _minMax(
379  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
380  const MaskIterator& maskBegin, uInt maskStride
381  ) const;
382 
383  virtual void _minMax(
385  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
386  const MaskIterator& maskBegin, uInt maskStride, const DataRanges& ranges,
387  Bool isInclude
388  ) const;
389 
390  virtual void _minMax(
392  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
393  Int64 nr, uInt dataStride
394  ) const;
395 
396  virtual void _minMax(
398  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
399  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
400  ) const;
401 
402  virtual void _minMax(
404  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
405  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
406  const DataRanges& ranges, Bool isInclude
407  ) const;
408 
409  virtual void _minMax(
411  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
412  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
413  ) const;
414  // </group>
415 
416  //<group>
417  // populate an unsorted array with valid data.
418  // no weights, no mask, no ranges
419  virtual void _populateArray(
420  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr, uInt dataStride
421  ) const;
422 
423  // ranges
424  virtual void _populateArray(
425  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
426  uInt dataStride, const DataRanges& ranges, Bool isInclude
427  ) const;
428 
429  virtual void _populateArray(
430  vector<AccumType>& ary, const DataIterator& dataBegin,
431  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
432  uInt maskStride
433  ) const;
434 
435  // mask and ranges
436  virtual void _populateArray(
437  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
438  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
439  const DataRanges& ranges, Bool isInclude
440  ) const;
441 
442  // weights
443  virtual void _populateArray(
444  vector<AccumType>& ary, const DataIterator& dataBegin,
445  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride
446  ) const;
447 
448  // weights and ranges
449  virtual void _populateArray(
450  vector<AccumType>& ary, const DataIterator& dataBegin,
451  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
452  const DataRanges& ranges, Bool isInclude
453  ) const;
454 
455  // weights and mask
456  virtual void _populateArray(
457  vector<AccumType>& ary, const DataIterator& dataBegin,
458  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
459  const MaskIterator& maskBegin, uInt maskStride
460  ) const;
461 
462  // weights, mask, ranges
463  virtual void _populateArray(
464  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
465  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
466  const DataRanges& ranges, Bool isInclude
467  ) const;
468  // </group>
469 
470  // <group>
471  // Create a vector of unsorted arrays, one array for each bin defined by <src>includeLimits</src>.
472  // <src>includeLimits</src> should be non-overlapping and should be given in ascending order (the
473  // algorithm used assumes this). Once the sum of the lengths of all arrays equals <src>maxCount</src>
474  // the method will return with no further processing.
475  // no weights, no mask, no ranges
476  virtual void _populateArrays(
477  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr, uInt dataStride,
478  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
479  ) const;
480 
481  // ranges
482  virtual void _populateArrays(
483  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr,
484  uInt dataStride, const DataRanges& ranges, Bool isInclude,
485  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
486  ) const;
487 
488  virtual void _populateArrays(
489  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
490  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
491  uInt maskStride,
492  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
493  ) const;
494 
495  // mask and ranges
496  virtual void _populateArrays(
497  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, Int64 nr,
498  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
499  const DataRanges& ranges, Bool isInclude,
500  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
501  ) const;
502 
503  // weights
504  virtual void _populateArrays(
505  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
506  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
507  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
508  ) const;
509 
510  // weights and ranges
511  virtual void _populateArrays(
512  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
513  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
514  const DataRanges& ranges, Bool isInclude,
515  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
516  ) const;
517 
518  // weights and mask
519  virtual void _populateArrays(
520  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin,
521  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
522  const MaskIterator& maskBegin, uInt maskStride,
523  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
524  ) const;
525 
526  // weights, mask, ranges
527  virtual void _populateArrays(
528  vector<vector<AccumType> >& arys, uInt64& currentCount, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
529  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
530  const DataRanges& ranges, Bool isInclude,
531  const vector<std::pair<AccumType, AccumType> > &includeLimits, uInt64 maxCount
532  ) const;
533  // </group>
534 
535  // <group>
536  // no weights, no mask, no ranges
537  virtual Bool _populateTestArray(
538  vector<AccumType>& ary, const DataIterator& dataBegin,
539  Int64 nr, uInt dataStride, uInt maxElements
540  ) const;
541 
542  // ranges
543  virtual Bool _populateTestArray(
544  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
545  uInt dataStride, const DataRanges& ranges, Bool isInclude,
546  uInt maxElements
547  ) const;
548 
549  // mask
550  virtual Bool _populateTestArray(
551  vector<AccumType>& ary, const DataIterator& dataBegin,
552  Int64 nr, uInt dataStride, const MaskIterator& maskBegin,
553  uInt maskStride, uInt maxElements
554  ) const;
555 
556  // mask and ranges
557  virtual Bool _populateTestArray(
558  vector<AccumType>& ary, const DataIterator& dataBegin, Int64 nr,
559  uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
560  const DataRanges& ranges, Bool isInclude, uInt maxElements
561  ) const;
562 
563  // weights
564  virtual Bool _populateTestArray(
565  vector<AccumType>& ary, const DataIterator& dataBegin,
566  const WeightsIterator& weightBegin, Int64 nr, uInt dataStride,
567  uInt maxElements
568  ) const;
569 
570  // weights and ranges
571  virtual Bool _populateTestArray(
572  vector<AccumType>& ary, const DataIterator& dataBegin,
573  const WeightsIterator& weightsBegin, Int64 nr, uInt dataStride,
574  const DataRanges& ranges, Bool isInclude, uInt maxElements
575  ) const;
576 
577  // weights and mask
578  virtual Bool _populateTestArray(
579  vector<AccumType>& ary, const DataIterator& dataBegin,
580  const WeightsIterator& weightBegin, Int64 nr,
581  uInt dataStride, const MaskIterator& maskBegin,
582  uInt maskStride, uInt maxElements
583  ) const;
584 
585  // weights, mask, ranges
586  virtual Bool _populateTestArray(
587  vector<AccumType>& ary, const DataIterator& dataBegin, const WeightsIterator& weightBegin,
588  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
589  const DataRanges& ranges, Bool isInclude,
590  uInt maxElements
591  ) const;
592  // </group>
593 
594  // <group>
595  // no weights, no mask, no ranges
596  virtual void _unweightedStats(
597  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
598  const DataIterator& dataBegin, Int64 nr, uInt dataStride
599  );
600 
601  // no weights, no mask
602  virtual void _unweightedStats(
603  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
604  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
605  const DataRanges& ranges, Bool isInclude
606  );
607 
608  virtual void _unweightedStats(
609  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
610  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
611  const MaskIterator& maskBegin, uInt maskStride
612  );
613 
614  virtual void _unweightedStats(
615  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
616  const DataIterator& dataBegin, Int64 nr, uInt dataStride,
617  const MaskIterator& maskBegin, uInt maskStride,
618  const DataRanges& ranges, Bool isInclude
619  );
620 
621  // </group>
622  virtual void _updateDataProviderMaxMin(
623  const StatsData<AccumType>& threadStats
624  );
625 
626  // <group>
627  // has weights, but no mask, no ranges
628  virtual void _weightedStats(
629  StatsData<AccumType>& stats, LocationType& location,
630  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
631  Int64 nr, uInt dataStride
632  );
633 
634  virtual void _weightedStats(
635  StatsData<AccumType>& stats, LocationType& location,
636  const DataIterator& dataBegin, const WeightsIterator& weightsBegin,
637  Int64 nr, uInt dataStride, const DataRanges& ranges, Bool isInclude
638  );
639 
640  virtual void _weightedStats(
641  StatsData<AccumType>& stats, LocationType& location,
642  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
643  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride
644  );
645 
646  virtual void _weightedStats(
647  StatsData<AccumType>& stats, LocationType& location,
648  const DataIterator& dataBegin, const WeightsIterator& weightBegin,
649  Int64 nr, uInt dataStride, const MaskIterator& maskBegin, uInt maskStride,
650  const DataRanges& ranges, Bool isInclude
651  );
652  // </group>
653 
654 private:
658  _hasData;
659 
660  // mutables, used to mitigate repeated code
661  mutable typename vector<DataIterator>::const_iterator _dend, _diter;
662  mutable vector<Int64>::const_iterator _citer;
663  mutable vector<uInt>::const_iterator _dsiter;
664  mutable std::map<uInt, MaskIterator> _masks;
665  mutable uInt _maskStride;
666  mutable std::map<uInt, WeightsIterator> _weights;
667  mutable std::map<uInt, DataRanges> _ranges;
668  mutable std::map<uInt, Bool> _isIncludeRanges;
671  mutable MaskIterator _myMask;
672  mutable DataIterator _myData;
673  mutable WeightsIterator _myWeights;
675  mutable uInt64 _myCount;
676 
677  static const uInt CACHE_PADDING;
678  static const uInt BLOCK_SIZE;
679 
680  // tally the number of data points that fall into each bin provided by <src>binDesc</src>
681  // Any points that are less than binDesc.minLimit or greater than
682  // binDesc.minLimit + binDesc.nBins*binDesc.binWidth are not included in the counts. A data
683  // point that falls exactly on a bin boundary is considered to be in the higher index bin.
684  // <src>sameVal</src> will be non-null if all the good values in the histogram range are the
685  // same. In that case, the value held will be the value of each of those data points.
686  vector<vector<uInt64> > _binCounts(
687  vector<CountedPtr<AccumType> >& sameVal,
688  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc
689  );
690 
691  void _computeBins(
692  vector<vector<uInt64> >& bins, vector<CountedPtr<AccumType> >& sameVal,
693  vector<Bool>& allSame, DataIterator dataIter, MaskIterator maskIter,
694  WeightsIterator weightsIter, uInt64 count,
695  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc,
696  const vector<AccumType>& maxLimit
697  );
698 
699  void _computeDataArray(
700  vector<AccumType>& ary, DataIterator dataIter,
701  MaskIterator maskIter, WeightsIterator weightsIter,
702  uInt64 dataCount
703  );
704 
705  void _computeDataArrays(
706  vector<vector<AccumType> >& arys, uInt64& currentCount,
707  DataIterator dataIter, MaskIterator maskIter,
708  WeightsIterator weightsIter, uInt64 dataCount,
709  const vector<std::pair<AccumType, AccumType> >& includeLimits,
710  uInt64 maxCount
711  );
712 
713  void _computeMinMax(
715  DataIterator dataIter, MaskIterator maskIter,
716  WeightsIterator weightsIter, uInt64 dataCount
717  );
718 
719  void _computeStats(
720  StatsData<AccumType>& stats, uInt64& ngood, LocationType& location,
721  DataIterator dataIter, MaskIterator maskIter,
722  WeightsIterator weightsIter, uInt64 count
723  );
724 
725  // convert in place by taking the absolute value of the difference of the vector and the median
726  static void _convertToAbsDevMedArray(vector<AccumType>& myArray, AccumType median);
727 
728  // Create an unsorted array of the complete data set. If <src>includeLimits</src> is specified,
729  // only points within those limits (including min but excluding max, as per definition of bins),
730  // are included.
731  void _createDataArray(
732  vector<AccumType>& array
733  );
734 
735  void _createDataArrays(
736  vector<vector<AccumType> >& arrays,
737  const vector<std::pair<AccumType, AccumType> > &includeLimits,
738  uInt64 maxCount
739  );
740  // extract data from multiple histograms given by <src>binDesc</src>.
741  // <src>dataIndices</src> represent the indices of the sorted arrays of values to
742  // extract. There should be exactly one set of data indices to extract for each
743  // supplied histogram. The data indices are relative to the minimum value of the minimum
744  // bin in their repsective histograms. The ordering of the maps in the returned vector represent
745  // the ordering of histograms in <src>binDesc</src>. <src>binDesc</src> should contain
746  // non-overlapping histograms and the histograms should be specified in ascending order.
747  vector<std::map<uInt64, AccumType> > _dataFromMultipleBins(
748  const vector<typename StatisticsUtilities<AccumType>::BinDesc>& binDesc, uInt64 maxArraySize,
749  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
750  );
751 
752  vector<std::map<uInt64, AccumType> > _dataFromSingleBins(
753  const vector<uInt64>& binNpts, uInt64 maxArraySize,
754  const vector<std::pair<AccumType, AccumType> >& binLimits,
755  const vector<std::set<uInt64> >& dataIndices, uInt64 nBins
756  );
757 
758  Int64 _doNpts();
759 
760  // increment the relevant loop counters
761  Bool _increment(Bool includeIDataset);
762 
763  // increment thread-based iterators
765  DataIterator& dataIter, MaskIterator& maskIter,
766  WeightsIterator& weightsIter, uInt64& offset, uInt nthreads
767  ) const;
768 
769  // get the values for the specified indices in the sorted array of all good data
770  std::map<uInt64, AccumType> _indicesToValues(
771  CountedPtr<uInt64> knownNpts, CountedPtr<AccumType> knownMin,
772  CountedPtr<AccumType> knownMax, uInt64 maxArraySize,
773  const std::set<uInt64>& dataIndices, Bool persistSortedArray,
774  uInt64 nBins
775  );
776 
777  void _initIterators();
778 
779  void _initLoopVars();
780 
781  void _initThreadVars(
782  uInt& nBlocks, uInt64& extra, uInt& nthreads, PtrHolder<DataIterator>& dataIter,
783  PtrHolder<MaskIterator>& maskIter, PtrHolder<WeightsIterator>& weightsIter,
784  PtrHolder<uInt64>& offset, uInt nThreadsMax
785  ) const;
786 
787  // Determine by scanning the dataset if the number of good points is smaller than
788  // <src>maxArraySize</src>. If so, <src>arrayToSort</src> will contain the unsorted
789  // data values. If not, this vector will be empty.
790  Bool _isNptsSmallerThan(vector<AccumType>& arrayToSort, uInt maxArraySize);
791 
792  // If <src>allowPad</src> is True, then pad the lower side of the lowest bin and the
793  // higher side of the highest bin so that minData and maxData do not fall on the edge
794  // of their respective bins. If false, no padding so that minData and maxData are also
795  // exactly the histogram abscissa limits.
796  static void _makeBins(
797  typename StatisticsUtilities<AccumType>::BinDesc& bins, AccumType minData, AccumType maxData, uInt maxBins,
798  Bool allowPad
799  );
800 
801  static void _mergeResults(
802  vector<vector<uInt64> >& bins, vector<CountedPtr<AccumType> >& sameVal,
803  vector<Bool>& allSame, const PtrHolder<vector<vector<uInt64> > >& tBins,
804  const PtrHolder<vector<CountedPtr<AccumType> > >& tSameVal,
805  const PtrHolder<vector<Bool> >& tAllSame, uInt nThreadsMax
806  );
807 
808  // get the index (for odd npts) or indices (for even npts) of the median of the sorted array.
809  // If knownNpts is not null, it will be used and must be correct. If it is null, the value of
810  // _npts will be used if it has been previously calculated. If not, the data sets will
811  // be scanned to determine npts.
812  std::set<uInt64> _medianIndices(CountedPtr<uInt64> knownNpts);
813 
814  uInt _nThreadsMax() const;
815 
816  uInt _threadIdx() const;
817 
818  // get values from sorted array if the array is small enough to be held in
819  // memory. Note that this is the array containing all good data, not data in
820  // just a single bin representing a subset of good data.
821  // Returns True if the data were successfully retrieved.
822  // If True is returned, the values map will contain a map of index to value.
824  std::map<uInt64, AccumType>& values, CountedPtr<uInt64> knownNpts,
825  const std::set<uInt64>& indices, uInt64 maxArraySize,
826  Bool persistSortedArray
827  );
828 };
829 
830 }
831 
832 #ifndef CASACORE_NO_AUTO_TEMPLATES
833 #include <casacore/scimath/Mathematics/ClassicalStatistics.tcc>
834 #endif //# CASACORE_NO_AUTO_TEMPLATES
835 
836 #endif
void _doMinMax(AccumType &vmin, AccumType &vmax)
scan dataset(s) to find min and max
vector< std::map< uInt64, AccumType > > _dataFromMultipleBins(const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, uInt64 maxArraySize, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
extract data from multiple histograms given by binDesc.
Bool _valuesFromSortedArray(std::map< uInt64, AccumType > &values, CountedPtr< uInt64 > knownNpts, const std::set< uInt64 > &indices, uInt64 maxArraySize, Bool persistSortedArray)
get values from sorted array if the array is small enough to be held in memory.
vector< DataIterator >::const_iterator _dend
mutables, used to mitigate repeated code
long long Int64
Define the extra non-standard types used by Casacore (like proposed uSize, Size)
Definition: aipsxtype.h:38
void _computeMinMax(CountedPtr< AccumType > &mymax, CountedPtr< AccumType > &mymin, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount)
LatticeExprNode median(const LatticeExprNode &expr)
AccumType _getStatistic(StatisticsData::STATS stat)
ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > & operator=(const ClassicalStatistics< AccumType, DataIterator, MaskIterator, WeightsIterator > &other)
copy semantics
vector< DataIterator >::const_iterator _diter
virtual void _minMax(CountedPtr< AccumType > &mymin, CountedPtr< AccumType > &mymax, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
StatsData< AccumType > _getStatistics()
virtual StatsData< AccumType > & _getStatsData()
retreive stats structure.
TableExprNode array(const TableExprNode &values, const TableExprNodeSet &shape)
Create an array of the given shape and fill it with the values.
Definition: ExprNode.h:2093
void _createDataArray(vector< AccumType > &array)
Create an unsorted array of the complete data set.
unsigned long long uInt64
Definition: aipsxtype.h:39
std::set< uInt64 > _medianIndices(CountedPtr< uInt64 > knownNpts)
get the index (for odd npts) or indices (for even npts) of the median of the sorted array...
PtrHolder(const PtrHolder< T > &other)
virtual std::pair< Int64, Int64 > getStatisticIndex(StatisticsData::STATS stat)
see base class description
std::map< uInt64, AccumType > _indicesToValues(CountedPtr< uInt64 > knownNpts, CountedPtr< AccumType > knownMin, CountedPtr< AccumType > knownMax, uInt64 maxArraySize, const std::set< uInt64 > &dataIndices, Bool persistSortedArray, uInt64 nBins)
get the values for the specified indices in the sorted array of all good data
std::pair< Int64, Int64 > LocationType
std::map< uInt, DataRanges > _ranges
void setStatsToCalculate(std::set< StatisticsData::STATS > &stats)
Provide guidance to algorithms by specifying a priori which statistics the caller would like calculat...
Abstract base class which defines interface for providing "datasets" to the statistics framework when...
Class to calculate statistics in a "classical" sense, ie using accumulators with no special filtering...
void _computeBins(vector< vector< uInt64 > > &bins, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 count, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit)
virtual uInt64 getNPts()
scan the dataset(s) that have been added, and find the number of good points.
Hold and delete pointers not deleted by object destructors.
Definition: PtrHolder.h:81
virtual void getMinMax(AccumType &mymin, AccumType &mymax)
scan the dataset(s) that have been added, and find the min and max.
vector< vector< uInt64 > > _binCounts(vector< CountedPtr< AccumType > > &sameVal, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc)
tally the number of data points that fall into each bin provided by binDesc Any points that are less ...
std::map< uInt, WeightsIterator > _weights
void _computeDataArray(vector< AccumType > &ary, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount)
ALGORITHM
implemented algorithms
virtual const StatsData< AccumType > & _getStatsData() const
virtual void _populateArrays(vector< vector< AccumType > > &arys, uInt64 &currentCount, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount) const
Create a vector of unsorted arrays, one array for each bin defined by includeLimits.
void _addData()
Allows derived classes to do things after data is set or added.
Referenced counted pointer for constant data.
Definition: CountedPtr.h:86
virtual void _accumNpts(uInt64 &npts, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
scan through the data set to determine the number of good (unmasked, weight > 0, within range) points...
virtual AccumType getMedianAbsDevMed(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
get the median of the absolute deviation about the median of the data.
std::map< uInt, Bool > _isIncludeRanges
Bool hasData() const
Has any data been added to this object? Will return False if the object has been reset and no data ha...
void _accumulate(StatsData< AccumType > &stats, const AccumType &datum, const LocationType &location)
virtual void _findBins(vector< vector< uInt64 > > &binCounts, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const DataIterator &dataBegin, Int64 nr, uInt dataStride, const vector< typename StatisticsUtilities< AccumType >::BinDesc > &binDesc, const vector< AccumType > &maxLimit) const
Get the counts of data within the specified histogram bins.
static void _convertToAbsDevMedArray(vector< AccumType > &myArray, AccumType median)
convert in place by taking the absolute value of the difference of the vector and the median ...
void _computeDataArrays(vector< vector< AccumType > > &arys, uInt64 &currentCount, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 dataCount, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount)
void _initThreadVars(uInt &nBlocks, uInt64 &extra, uInt &nthreads, PtrHolder< DataIterator > &dataIter, PtrHolder< MaskIterator > &maskIter, PtrHolder< WeightsIterator > &weightsIter, PtrHolder< uInt64 > &offset, uInt nThreadsMax) const
#define DataRanges
Commonly used types in statistics framework.
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
virtual Bool _populateTestArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride, uInt maxElements) const
no weights, no mask, no ranges
virtual void setCalculateAsAdded(Bool c)
Should statistics be updated with calls to addData or should they only be calculated upon calls to ge...
void setDataProvider(StatsDataProvider< AccumType, DataIterator, MaskIterator, WeightsIterator > *dataProvider)
An exception will be thrown if setCalculateAsAdded(True) has been called.
virtual AccumType getMedianAndQuantiles(std::map< Double, AccumType > &quantiles, const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
If one needs to compute both the median and quantile values, it is better to call getMedianAndQuantil...
virtual std::map< Double, AccumType > getQuantiles(const std::set< Double > &fractions, CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
Get the specified quantiles.
std::map< uInt, MaskIterator > _masks
void _createDataArrays(vector< vector< AccumType > > &arrays, const vector< std::pair< AccumType, AccumType > > &includeLimits, uInt64 maxCount)
static void _makeBins(typename StatisticsUtilities< AccumType >::BinDesc &bins, AccumType minData, AccumType maxData, uInt maxBins, Bool allowPad)
If allowPad is True, then pad the lower side of the lowest bin and the higher side of the highest bin...
vector< uInt >::const_iterator _dsiter
const Bool False
Definition: aipstype.h:44
virtual void _unweightedStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, const DataIterator &dataBegin, Int64 nr, uInt dataStride)
no weights, no mask, no ranges
StatsData< AccumType > _statsData
vector< std::map< uInt64, AccumType > > _dataFromSingleBins(const vector< uInt64 > &binNpts, uInt64 maxArraySize, const vector< std::pair< AccumType, AccumType > > &binLimits, const vector< std::set< uInt64 > > &dataIndices, uInt64 nBins)
Bool _increment(Bool includeIDataset)
increment the relevant loop counters
virtual void reset()
reset object to initial state.
virtual StatisticsData::ALGORITHM algorithm() const
get the algorithm that this object uses for computing stats
vector< Int64 >::const_iterator _citer
const Double c
Fundamental physical constants (SI units):
virtual AccumType getMedian(CountedPtr< uInt64 > knownNpts=NULL, CountedPtr< AccumType > knownMin=NULL, CountedPtr< AccumType > knownMax=NULL, uInt binningThreshholdSizeBytes=4096 *4096, Bool persistSortedArray=False, uInt64 nBins=10000)
In the following group of methods, if the size of the composite dataset is smaller than binningThresh...
static void _mergeResults(vector< vector< uInt64 > > &bins, vector< CountedPtr< AccumType > > &sameVal, vector< Bool > &allSame, const PtrHolder< vector< vector< uInt64 > > > &tBins, const PtrHolder< vector< CountedPtr< AccumType > > > &tSameVal, const PtrHolder< vector< Bool > > &tAllSame, uInt nThreadsMax)
virtual void _weightedStats(StatsData< AccumType > &stats, LocationType &location, const DataIterator &dataBegin, const WeightsIterator &weightsBegin, Int64 nr, uInt dataStride)
has weights, but no mask, no ranges
void _incrementThreadIters(DataIterator &dataIter, MaskIterator &maskIter, WeightsIterator &weightsIter, uInt64 &offset, uInt nthreads) const
increment thread-based iterators
Bool _isNptsSmallerThan(vector< AccumType > &arrayToSort, uInt maxArraySize)
Determine by scanning the dataset if the number of good points is smaller than maxArraySize.
virtual void _updateDataProviderMaxMin(const StatsData< AccumType > &threadStats)
virtual void _populateArray(vector< AccumType > &ary, const DataIterator &dataBegin, Int64 nr, uInt dataStride) const
populate an unsorted array with valid data.
virtual StatsData< AccumType > _getInitialStats() const
void _computeStats(StatsData< AccumType > &stats, uInt64 &ngood, LocationType &location, DataIterator dataIter, MaskIterator maskIter, WeightsIterator weightsIter, uInt64 count)
Base class of statistics algorithm class hierarchy.
this file contains all the compiler specific defines
Definition: mainpage.dox:28
unsigned int uInt
Definition: aipstype.h:51
description of a regularly spaced bins with the first bin having lower limit of minLimit and having n...