2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* File: statistc.c (Formerly stats.c)
|
|
|
|
* Description: Simple statistical package for integer values.
|
2016-11-08 02:46:33 +08:00
|
|
|
* Author: Ray Smith
|
|
|
|
* Created: Mon Feb 04 16:56:05 GMT 1991
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
|
|
|
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
**********************************************************************/
|
|
|
|
|
2012-09-22 08:59:31 +08:00
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "config_auto.h"
|
|
|
|
#endif
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
#include "statistc.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include <string.h>
|
|
|
|
#include <math.h>
|
|
|
|
#include <stdlib.h>
|
2011-03-22 05:44:45 +08:00
|
|
|
#include "helpers.h"
|
|
|
|
#include "scrollview.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "tprintf.h"
|
|
|
|
|
2013-09-21 03:43:47 +08:00
|
|
|
using tesseract::KDPairInc;
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* STATS::STATS
|
|
|
|
*
|
|
|
|
* Construct a new stats element by allocating and zeroing the memory.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
STATS::STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1) {
|
|
|
|
if (max_bucket_value_plus_1 <= min_bucket_value) {
|
|
|
|
min_bucket_value = 0;
|
|
|
|
max_bucket_value_plus_1 = 1;
|
|
|
|
}
|
|
|
|
rangemin_ = min_bucket_value; // setup
|
|
|
|
rangemax_ = max_bucket_value_plus_1;
|
|
|
|
buckets_ = new inT32[rangemax_ - rangemin_];
|
|
|
|
clear();
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
STATS::STATS() {
|
|
|
|
rangemax_ = 0;
|
|
|
|
rangemin_ = 0;
|
|
|
|
buckets_ = NULL;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::set_range
|
|
|
|
*
|
|
|
|
* Alter the range on an existing stats element.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
bool STATS::set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1) {
|
|
|
|
if (max_bucket_value_plus_1 <= min_bucket_value) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return false;
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
if (rangemax_ - rangemin_ != max_bucket_value_plus_1 - min_bucket_value) {
|
|
|
|
delete [] buckets_;
|
|
|
|
buckets_ = new inT32[max_bucket_value_plus_1 - min_bucket_value];
|
|
|
|
}
|
|
|
|
rangemin_ = min_bucket_value; // setup
|
|
|
|
rangemax_ = max_bucket_value_plus_1;
|
|
|
|
clear(); // zero it
|
2007-03-08 04:03:40 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::clear
|
|
|
|
*
|
|
|
|
* Clear out the STATS class by zeroing all the buckets.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::clear() { // clear out buckets
|
|
|
|
total_count_ = 0;
|
|
|
|
if (buckets_ != NULL)
|
|
|
|
memset(buckets_, 0, (rangemax_ - rangemin_) * sizeof(buckets_[0]));
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::~STATS
|
|
|
|
*
|
|
|
|
* Destructor for a stats class.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
STATS::~STATS () {
|
2017-04-30 20:39:19 +08:00
|
|
|
delete [] buckets_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::add
|
|
|
|
*
|
|
|
|
* Add a set of samples to (or delete from) a pile.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::add(inT32 value, inT32 count) {
|
|
|
|
if (buckets_ == NULL) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
value = ClipToRange(value, rangemin_, rangemax_ - 1);
|
|
|
|
buckets_[value - rangemin_] += count;
|
|
|
|
total_count_ += count; // keep count of total
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::mode
|
|
|
|
*
|
|
|
|
* Find the mode of a stats class.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 STATS::mode() const { // get mode of samples
|
|
|
|
if (buckets_ == NULL) {
|
|
|
|
return rangemin_;
|
|
|
|
}
|
|
|
|
inT32 max = buckets_[0]; // max cell count
|
|
|
|
inT32 maxindex = 0; // index of max
|
|
|
|
for (int index = rangemax_ - rangemin_ - 1; index > 0; --index) {
|
|
|
|
if (buckets_[index] > max) {
|
|
|
|
max = buckets_[index]; // find biggest
|
2007-03-08 04:03:40 +08:00
|
|
|
maxindex = index;
|
|
|
|
}
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
return maxindex + rangemin_; // index of biggest
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::mean
|
|
|
|
*
|
|
|
|
* Find the mean of a stats class.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
double STATS::mean() const { //get mean of samples
|
|
|
|
if (buckets_ == NULL || total_count_ <= 0) {
|
|
|
|
return static_cast<double>(rangemin_);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
inT64 sum = 0;
|
|
|
|
for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
|
|
|
|
sum += static_cast<inT64>(index) * buckets_[index];
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
return static_cast<double>(sum) / total_count_ + rangemin_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::sd
|
|
|
|
*
|
|
|
|
* Find the standard deviation of a stats class.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
double STATS::sd() const { //standard deviation
|
|
|
|
if (buckets_ == NULL || total_count_ <= 0) {
|
|
|
|
return 0.0;
|
|
|
|
}
|
|
|
|
inT64 sum = 0;
|
|
|
|
double sqsum = 0.0;
|
|
|
|
for (int index = rangemax_ - rangemin_ - 1; index >= 0; --index) {
|
|
|
|
sum += static_cast<inT64>(index) * buckets_[index];
|
|
|
|
sqsum += static_cast<double>(index) * index * buckets_[index];
|
|
|
|
}
|
|
|
|
double variance = static_cast<double>(sum) / total_count_;
|
|
|
|
variance = sqsum / total_count_ - variance * variance;
|
|
|
|
if (variance > 0.0)
|
|
|
|
return sqrt(variance);
|
|
|
|
return 0.0;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* STATS::ile
|
|
|
|
*
|
|
|
|
* Returns the fractile value such that frac fraction (in [0,1]) of samples
|
|
|
|
* has a value less than the return value.
|
|
|
|
**********************************************************************/
|
|
|
|
double STATS::ile(double frac) const {
|
|
|
|
if (buckets_ == NULL || total_count_ == 0) {
|
|
|
|
return static_cast<double>(rangemin_);
|
|
|
|
}
|
|
|
|
#if 0
|
|
|
|
// TODO(rays) The existing code doesn't seem to be doing the right thing
|
|
|
|
// with target a double but this substitute crashes the code that uses it.
|
|
|
|
// Investigate and fix properly.
|
|
|
|
int target = IntCastRounded(frac * total_count_);
|
|
|
|
target = ClipToRange(target, 1, total_count_);
|
|
|
|
#else
|
|
|
|
double target = frac * total_count_;
|
|
|
|
target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
|
|
|
|
#endif
|
|
|
|
int sum = 0;
|
|
|
|
int index = 0;
|
|
|
|
for (index = 0; index < rangemax_ - rangemin_ && sum < target;
|
|
|
|
sum += buckets_[index++]);
|
|
|
|
if (index > 0) {
|
|
|
|
ASSERT_HOST(buckets_[index - 1] > 0);
|
|
|
|
return rangemin_ + index -
|
|
|
|
static_cast<double>(sum - target) / buckets_[index - 1];
|
|
|
|
} else {
|
|
|
|
return static_cast<double>(rangemin_);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::min_bucket
|
|
|
|
*
|
2015-09-15 04:02:00 +08:00
|
|
|
* Find REAL minimum bucket - ile(0.0) isn't necessarily correct
|
2011-03-22 05:44:45 +08:00
|
|
|
**********************************************************************/
|
|
|
|
inT32 STATS::min_bucket() const { // Find min
|
|
|
|
if (buckets_ == NULL || total_count_ == 0) {
|
|
|
|
return rangemin_;
|
|
|
|
}
|
|
|
|
inT32 min = 0;
|
|
|
|
for (min = 0; (min < rangemax_ - rangemin_) && (buckets_[min] == 0); min++);
|
|
|
|
return rangemin_ + min;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
2011-03-22 05:44:45 +08:00
|
|
|
* STATS::max_bucket
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
2015-09-15 04:02:00 +08:00
|
|
|
* Find REAL maximum bucket - ile(1.0) isn't necessarily correct
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 STATS::max_bucket() const { // Find max
|
|
|
|
if (buckets_ == NULL || total_count_ == 0) {
|
|
|
|
return rangemin_;
|
|
|
|
}
|
|
|
|
inT32 max;
|
|
|
|
for (max = rangemax_ - rangemin_ - 1; max > 0 && buckets_[max] == 0; max--);
|
|
|
|
return rangemin_ + max;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::median
|
|
|
|
*
|
2011-03-22 05:44:45 +08:00
|
|
|
* Finds a more useful estimate of median than ile(0.5).
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
|
|
|
* Overcomes a problem with ile() - if the samples are, for example,
|
|
|
|
* 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
|
|
|
|
* between 6 and 13 = 9.5
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
double STATS::median() const { //get median
|
|
|
|
if (buckets_ == NULL) {
|
|
|
|
return static_cast<double>(rangemin_);
|
|
|
|
}
|
|
|
|
double median = ile(0.5);
|
|
|
|
int median_pile = static_cast<int>(floor(median));
|
|
|
|
if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
|
|
|
|
inT32 min_pile;
|
|
|
|
inT32 max_pile;
|
2015-09-15 04:02:00 +08:00
|
|
|
/* Find preceding non zero pile */
|
2011-03-22 05:44:45 +08:00
|
|
|
for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--);
|
2007-03-08 04:03:40 +08:00
|
|
|
/* Find following non zero pile */
|
2011-03-22 05:44:45 +08:00
|
|
|
for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++);
|
|
|
|
median = (min_pile + max_pile) / 2.0;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
return median;
|
|
|
|
}
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* STATS::local_min
|
|
|
|
*
|
|
|
|
* Return TRUE if this point is a local min.
|
|
|
|
**********************************************************************/
|
|
|
|
bool STATS::local_min(inT32 x) const {
|
|
|
|
if (buckets_ == NULL) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
x = ClipToRange(x, rangemin_, rangemax_ - 1) - rangemin_;
|
|
|
|
if (buckets_[x] == 0)
|
|
|
|
return true;
|
|
|
|
inT32 index; // table index
|
|
|
|
for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index);
|
|
|
|
if (index >= 0 && buckets_[index] < buckets_[x])
|
|
|
|
return false;
|
|
|
|
for (index = x + 1; index < rangemax_ - rangemin_ &&
|
|
|
|
buckets_[index] == buckets_[x]; ++index);
|
|
|
|
if (index < rangemax_ - rangemin_ && buckets_[index] < buckets_[x])
|
|
|
|
return false;
|
|
|
|
else
|
|
|
|
return true;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::smooth
|
|
|
|
*
|
|
|
|
* Apply a triangular smoothing filter to the stats.
|
|
|
|
* This makes the modes a bit more useful.
|
|
|
|
* The factor gives the height of the triangle, i.e. the weight of the
|
|
|
|
* centre.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::smooth(inT32 factor) {
|
|
|
|
if (buckets_ == NULL || factor < 2) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
STATS result(rangemin_, rangemax_);
|
|
|
|
int entrycount = rangemax_ - rangemin_;
|
|
|
|
for (int entry = 0; entry < entrycount; entry++) {
|
2007-03-08 04:03:40 +08:00
|
|
|
//centre weight
|
2011-03-22 05:44:45 +08:00
|
|
|
int count = buckets_[entry] * factor;
|
|
|
|
for (int offset = 1; offset < factor; offset++) {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (entry - offset >= 0)
|
2011-03-22 05:44:45 +08:00
|
|
|
count += buckets_[entry - offset] * (factor - offset);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (entry + offset < entrycount)
|
2011-03-22 05:44:45 +08:00
|
|
|
count += buckets_[entry + offset] * (factor - offset);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
result.add(entry + rangemin_, count);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
total_count_ = result.total_count_;
|
|
|
|
memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::cluster
|
|
|
|
*
|
|
|
|
* Cluster the samples into max_cluster clusters.
|
|
|
|
* Each call runs one iteration. The array of clusters must be
|
|
|
|
* max_clusters+1 in size as cluster 0 is used to indicate which samples
|
|
|
|
* have been used.
|
|
|
|
* The return value is the current number of clusters.
|
|
|
|
**********************************************************************/
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 STATS::cluster(float lower, // thresholds
|
2007-03-08 04:03:40 +08:00
|
|
|
float upper,
|
2011-03-22 05:44:45 +08:00
|
|
|
float multiple, // distance threshold
|
|
|
|
inT32 max_clusters, // max no to make
|
|
|
|
STATS *clusters) { // array of clusters
|
|
|
|
BOOL8 new_cluster; // added one
|
|
|
|
float *centres; // cluster centres
|
|
|
|
inT32 entry; // bucket index
|
|
|
|
inT32 cluster; // cluster index
|
|
|
|
inT32 best_cluster; // one to assign to
|
|
|
|
inT32 new_centre = 0; // residual mode
|
|
|
|
inT32 new_mode; // pile count of new_centre
|
|
|
|
inT32 count; // pile to place
|
|
|
|
float dist; // from cluster
|
|
|
|
float min_dist; // from best_cluster
|
|
|
|
inT32 cluster_count; // no of clusters
|
|
|
|
|
|
|
|
if (buckets_ == NULL || max_clusters < 1)
|
2007-03-08 04:03:40 +08:00
|
|
|
return 0;
|
2011-03-22 05:44:45 +08:00
|
|
|
centres = new float[max_clusters + 1];
|
2007-03-08 04:03:40 +08:00
|
|
|
for (cluster_count = 1; cluster_count <= max_clusters
|
2011-03-22 05:44:45 +08:00
|
|
|
&& clusters[cluster_count].buckets_ != NULL
|
|
|
|
&& clusters[cluster_count].total_count_ > 0;
|
|
|
|
cluster_count++) {
|
2007-03-08 04:03:40 +08:00
|
|
|
centres[cluster_count] =
|
2011-03-22 05:44:45 +08:00
|
|
|
static_cast<float>(clusters[cluster_count].ile(0.5));
|
|
|
|
new_centre = clusters[cluster_count].mode();
|
2007-03-08 04:03:40 +08:00
|
|
|
for (entry = new_centre - 1; centres[cluster_count] - entry < lower
|
2011-03-22 05:44:45 +08:00
|
|
|
&& entry >= rangemin_
|
|
|
|
&& pile_count(entry) <= pile_count(entry + 1);
|
|
|
|
entry--) {
|
|
|
|
count = pile_count(entry) - clusters[0].pile_count(entry);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (count > 0) {
|
2011-03-22 05:44:45 +08:00
|
|
|
clusters[cluster_count].add(entry, count);
|
2007-03-08 04:03:40 +08:00
|
|
|
clusters[0].add (entry, count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (entry = new_centre + 1; entry - centres[cluster_count] < lower
|
2011-03-22 05:44:45 +08:00
|
|
|
&& entry < rangemax_
|
|
|
|
&& pile_count(entry) <= pile_count(entry - 1);
|
|
|
|
entry++) {
|
|
|
|
count = pile_count(entry) - clusters[0].pile_count(entry);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (count > 0) {
|
2011-03-22 05:44:45 +08:00
|
|
|
clusters[cluster_count].add(entry, count);
|
|
|
|
clusters[0].add(entry, count);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cluster_count--;
|
|
|
|
|
|
|
|
if (cluster_count == 0) {
|
2011-03-22 05:44:45 +08:00
|
|
|
clusters[0].set_range(rangemin_, rangemax_);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
do {
|
|
|
|
new_cluster = FALSE;
|
|
|
|
new_mode = 0;
|
2011-03-22 05:44:45 +08:00
|
|
|
for (entry = 0; entry < rangemax_ - rangemin_; entry++) {
|
|
|
|
count = buckets_[entry] - clusters[0].buckets_[entry];
|
2007-03-08 04:03:40 +08:00
|
|
|
//remaining pile
|
|
|
|
if (count > 0) { //any to handle
|
2011-03-22 05:44:45 +08:00
|
|
|
min_dist = static_cast<float>(MAX_INT32);
|
2007-03-08 04:03:40 +08:00
|
|
|
best_cluster = 0;
|
|
|
|
for (cluster = 1; cluster <= cluster_count; cluster++) {
|
2011-03-22 05:44:45 +08:00
|
|
|
dist = entry + rangemin_ - centres[cluster];
|
2007-03-08 04:03:40 +08:00
|
|
|
//find distance
|
|
|
|
if (dist < 0)
|
|
|
|
dist = -dist;
|
|
|
|
if (dist < min_dist) {
|
|
|
|
min_dist = dist; //find least
|
|
|
|
best_cluster = cluster;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (min_dist > upper //far enough for new
|
|
|
|
&& (best_cluster == 0
|
2011-03-22 05:44:45 +08:00
|
|
|
|| entry + rangemin_ > centres[best_cluster] * multiple
|
|
|
|
|| entry + rangemin_ < centres[best_cluster] / multiple)) {
|
2007-03-08 04:03:40 +08:00
|
|
|
if (count > new_mode) {
|
|
|
|
new_mode = count;
|
2011-03-22 05:44:45 +08:00
|
|
|
new_centre = entry + rangemin_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
// need new and room
|
2007-03-08 04:03:40 +08:00
|
|
|
if (new_mode > 0 && cluster_count < max_clusters) {
|
|
|
|
cluster_count++;
|
|
|
|
new_cluster = TRUE;
|
2014-04-24 09:02:45 +08:00
|
|
|
if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
|
|
|
|
delete [] centres;
|
2007-03-08 04:03:40 +08:00
|
|
|
return 0;
|
2014-04-24 09:02:45 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
centres[cluster_count] = static_cast<float>(new_centre);
|
|
|
|
clusters[cluster_count].add(new_centre, new_mode);
|
|
|
|
clusters[0].add(new_centre, new_mode);
|
2007-03-08 04:03:40 +08:00
|
|
|
for (entry = new_centre - 1; centres[cluster_count] - entry < lower
|
2011-03-22 05:44:45 +08:00
|
|
|
&& entry >= rangemin_
|
|
|
|
&& pile_count (entry) <= pile_count(entry + 1); entry--) {
|
|
|
|
count = pile_count(entry) - clusters[0].pile_count(entry);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (count > 0) {
|
2011-03-22 05:44:45 +08:00
|
|
|
clusters[cluster_count].add(entry, count);
|
|
|
|
clusters[0].add(entry, count);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
for (entry = new_centre + 1; entry - centres[cluster_count] < lower
|
2011-03-22 05:44:45 +08:00
|
|
|
&& entry < rangemax_
|
|
|
|
&& pile_count (entry) <= pile_count(entry - 1); entry++) {
|
|
|
|
count = pile_count(entry) - clusters[0].pile_count(entry);
|
2007-03-08 04:03:40 +08:00
|
|
|
if (count > 0) {
|
2011-03-22 05:44:45 +08:00
|
|
|
clusters[cluster_count].add(entry, count);
|
2007-03-08 04:03:40 +08:00
|
|
|
clusters[0].add (entry, count);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
centres[cluster_count] =
|
2011-03-22 05:44:45 +08:00
|
|
|
static_cast<float>(clusters[cluster_count].ile(0.5));
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
} while (new_cluster && cluster_count < max_clusters);
|
|
|
|
delete [] centres;
|
2007-03-08 04:03:40 +08:00
|
|
|
return cluster_count;
|
|
|
|
}
|
|
|
|
|
2013-09-21 03:43:47 +08:00
|
|
|
// Helper tests that the current index is still part of the peak and gathers
|
|
|
|
// the data into the peak, returning false when the peak is ended.
|
|
|
|
// src_buckets[index] - used_buckets[index] is the unused part of the histogram.
|
|
|
|
// prev_count is the histogram count of the previous index on entry and is
|
|
|
|
// updated to the current index on return.
|
|
|
|
// total_count and total_value are accumulating the mean of the peak.
|
|
|
|
static bool GatherPeak(int index, const int* src_buckets, int* used_buckets,
|
|
|
|
int* prev_count, int* total_count, double* total_value) {
|
|
|
|
int pile_count = src_buckets[index] - used_buckets[index];
|
|
|
|
if (pile_count <= *prev_count && pile_count > 0) {
|
|
|
|
// Accumulate count and index.count product.
|
|
|
|
*total_count += pile_count;
|
|
|
|
*total_value += index * pile_count;
|
|
|
|
// Mark this index as used
|
|
|
|
used_buckets[index] = src_buckets[index];
|
|
|
|
*prev_count = pile_count;
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finds (at most) the top max_modes modes, well actually the whole peak around
|
|
|
|
// each mode, returning them in the given modes vector as a <mean of peak,
|
|
|
|
// total count of peak> pair in order of decreasing total count.
|
|
|
|
// Since the mean is the key and the count the data in the pair, a single call
|
|
|
|
// to sort on the output will re-sort by increasing mean of peak if that is
|
|
|
|
// more useful than decreasing total count.
|
|
|
|
// Returns the actual number of modes found.
|
|
|
|
int STATS::top_n_modes(int max_modes,
|
|
|
|
GenericVector<KDPairInc<float, int> >* modes) const {
|
|
|
|
if (max_modes <= 0) return 0;
|
|
|
|
int src_count = rangemax_ - rangemin_;
|
|
|
|
// Used copies the counts in buckets_ as they get used.
|
|
|
|
STATS used(rangemin_, rangemax_);
|
|
|
|
modes->truncate(0);
|
|
|
|
// Total count of the smallest peak found so far.
|
|
|
|
int least_count = 1;
|
|
|
|
// Mode that is used as a seed for each peak
|
|
|
|
int max_count = 0;
|
|
|
|
do {
|
|
|
|
// Find an unused mode.
|
|
|
|
max_count = 0;
|
|
|
|
int max_index = 0;
|
|
|
|
for (int src_index = 0; src_index < src_count; src_index++) {
|
|
|
|
int pile_count = buckets_[src_index] - used.buckets_[src_index];
|
|
|
|
if (pile_count > max_count) {
|
|
|
|
max_count = pile_count;
|
|
|
|
max_index = src_index;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (max_count > 0) {
|
|
|
|
// Copy the bucket count to used so it doesn't get found again.
|
|
|
|
used.buckets_[max_index] = max_count;
|
|
|
|
// Get the entire peak.
|
|
|
|
double total_value = max_index * max_count;
|
|
|
|
int total_count = max_count;
|
|
|
|
int prev_pile = max_count;
|
|
|
|
for (int offset = 1; max_index + offset < src_count; ++offset) {
|
|
|
|
if (!GatherPeak(max_index + offset, buckets_, used.buckets_,
|
|
|
|
&prev_pile, &total_count, &total_value))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
prev_pile = buckets_[max_index];
|
|
|
|
for (int offset = 1; max_index - offset >= 0; ++offset) {
|
|
|
|
if (!GatherPeak(max_index - offset, buckets_, used.buckets_,
|
|
|
|
&prev_pile, &total_count, &total_value))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (total_count > least_count || modes->size() < max_modes) {
|
|
|
|
// We definitely want this mode, so if we have enough discard the least.
|
|
|
|
if (modes->size() == max_modes)
|
|
|
|
modes->truncate(max_modes - 1);
|
|
|
|
int target_index = 0;
|
|
|
|
// Linear search for the target insertion point.
|
|
|
|
while (target_index < modes->size() &&
|
|
|
|
(*modes)[target_index].data >= total_count)
|
|
|
|
++target_index;
|
|
|
|
float peak_mean =
|
|
|
|
static_cast<float>(total_value / total_count + rangemin_);
|
|
|
|
modes->insert(KDPairInc<float, int>(peak_mean, total_count),
|
|
|
|
target_index);
|
|
|
|
least_count = modes->back().data;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} while (max_count > 0);
|
|
|
|
return modes->size();
|
|
|
|
}
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* STATS::print
|
|
|
|
*
|
2011-03-22 05:44:45 +08:00
|
|
|
* Prints a summary and table of the histogram.
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::print() const {
|
|
|
|
if (buckets_ == NULL) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 min = min_bucket() - rangemin_;
|
|
|
|
inT32 max = max_bucket() - rangemin_;
|
|
|
|
|
|
|
|
int num_printed = 0;
|
|
|
|
for (int index = min; index <= max; index++) {
|
|
|
|
if (buckets_[index] != 0) {
|
|
|
|
tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
|
|
|
|
if (++num_printed % 8 == 0)
|
2007-03-08 04:03:40 +08:00
|
|
|
tprintf ("\n");
|
|
|
|
}
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
tprintf ("\n");
|
|
|
|
print_summary();
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
2011-03-22 05:44:45 +08:00
|
|
|
* STATS::print_summary
|
2007-03-08 04:03:40 +08:00
|
|
|
*
|
2011-03-22 05:44:45 +08:00
|
|
|
* Print a summary of the stats.
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::print_summary() const {
|
|
|
|
if (buckets_ == NULL) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 min = min_bucket();
|
|
|
|
inT32 max = max_bucket();
|
|
|
|
tprintf("Total count=%d\n", total_count_);
|
|
|
|
tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
|
|
|
|
tprintf("Lower quartile=%.2f\n", ile(0.25));
|
|
|
|
tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
|
|
|
|
tprintf("Upper quartile=%.2f\n", ile(0.75));
|
|
|
|
tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
|
|
|
|
tprintf("Range=%d\n", max + 1 - min);
|
|
|
|
tprintf("Mean= %.2f\n", mean());
|
|
|
|
tprintf("SD= %.2f\n", sd());
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::plot
|
|
|
|
*
|
|
|
|
* Draw a histogram of the stats table.
|
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::plot(ScrollView* window, // to draw in
|
|
|
|
float xorigin, // bottom left
|
2007-03-08 04:03:40 +08:00
|
|
|
float yorigin,
|
2011-03-22 05:44:45 +08:00
|
|
|
float xscale, // one x unit
|
|
|
|
float yscale, // one y unit
|
|
|
|
ScrollView::Color colour) const { // colour to draw in
|
|
|
|
if (buckets_ == NULL) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
2008-04-22 08:41:37 +08:00
|
|
|
window->Pen(colour);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
for (int index = 0; index < rangemax_ - rangemin_; index++) {
|
2008-02-01 08:36:18 +08:00
|
|
|
window->Rectangle( xorigin + xscale * index, yorigin,
|
2007-03-08 04:03:40 +08:00
|
|
|
xorigin + xscale * (index + 1),
|
2011-03-22 05:44:45 +08:00
|
|
|
yorigin + yscale * buckets_[index]);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* STATS::plotline
|
|
|
|
*
|
2011-03-22 05:44:45 +08:00
|
|
|
* Draw a histogram of the stats table. (Line only)
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2011-03-22 05:44:45 +08:00
|
|
|
void STATS::plotline(ScrollView* window, // to draw in
|
|
|
|
float xorigin, // bottom left
|
2007-03-08 04:03:40 +08:00
|
|
|
float yorigin,
|
2011-03-22 05:44:45 +08:00
|
|
|
float xscale, // one x unit
|
|
|
|
float yscale, // one y unit
|
|
|
|
ScrollView::Color colour) const { // colour to draw in
|
|
|
|
if (buckets_ == NULL) {
|
2007-03-08 04:03:40 +08:00
|
|
|
return;
|
|
|
|
}
|
2008-02-01 08:36:18 +08:00
|
|
|
window->Pen(colour);
|
2011-03-22 05:44:45 +08:00
|
|
|
window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
|
|
|
|
for (int index = 0; index < rangemax_ - rangemin_; index++) {
|
|
|
|
window->DrawTo(xorigin + xscale * index,
|
|
|
|
yorigin + yscale * buckets_[index]);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* choose_nth_item
|
|
|
|
*
|
|
|
|
* Returns the index of what would b the nth item in the array
|
|
|
|
* if the members were sorted, without actually sorting.
|
|
|
|
**********************************************************************/
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 choose_nth_item(inT32 index, float *array, inT32 count) {
|
|
|
|
inT32 next_sample; // next one to do
|
|
|
|
inT32 next_lesser; // space for new
|
|
|
|
inT32 prev_greater; // last one saved
|
|
|
|
inT32 equal_count; // no of equal ones
|
|
|
|
float pivot; // proposed median
|
|
|
|
float sample; // current sample
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (count <= 1)
|
|
|
|
return 0;
|
|
|
|
if (count == 2) {
|
|
|
|
if (array[0] < array[1]) {
|
|
|
|
return index >= 1 ? 1 : 0;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return index >= 1 ? 0 : 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (index < 0)
|
2011-03-22 05:44:45 +08:00
|
|
|
index = 0; // ensure legal
|
2007-03-08 04:03:40 +08:00
|
|
|
else if (index >= count)
|
|
|
|
index = count - 1;
|
2010-11-24 02:34:14 +08:00
|
|
|
equal_count = (inT32) (rand() % count);
|
2007-03-08 04:03:40 +08:00
|
|
|
pivot = array[equal_count];
|
2011-03-22 05:44:45 +08:00
|
|
|
// fill gap
|
2007-03-08 04:03:40 +08:00
|
|
|
array[equal_count] = array[0];
|
|
|
|
next_lesser = 0;
|
|
|
|
prev_greater = count;
|
|
|
|
equal_count = 1;
|
|
|
|
for (next_sample = 1; next_sample < prev_greater;) {
|
|
|
|
sample = array[next_sample];
|
|
|
|
if (sample < pivot) {
|
2011-03-22 05:44:45 +08:00
|
|
|
// shuffle
|
2007-03-08 04:03:40 +08:00
|
|
|
array[next_lesser++] = sample;
|
|
|
|
next_sample++;
|
|
|
|
}
|
|
|
|
else if (sample > pivot) {
|
|
|
|
prev_greater--;
|
2011-03-22 05:44:45 +08:00
|
|
|
// juggle
|
2007-03-08 04:03:40 +08:00
|
|
|
array[next_sample] = array[prev_greater];
|
|
|
|
array[prev_greater] = sample;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
equal_count++;
|
|
|
|
next_sample++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (next_sample = next_lesser; next_sample < prev_greater;)
|
|
|
|
array[next_sample++] = pivot;
|
|
|
|
if (index < next_lesser)
|
|
|
|
return choose_nth_item (index, array, next_lesser);
|
|
|
|
else if (index < prev_greater)
|
2011-03-22 05:44:45 +08:00
|
|
|
return next_lesser; // in equal bracket
|
2007-03-08 04:03:40 +08:00
|
|
|
else
|
|
|
|
return choose_nth_item (index - prev_greater,
|
|
|
|
array + prev_greater,
|
|
|
|
count - prev_greater) + prev_greater;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* choose_nth_item
|
|
|
|
*
|
2011-03-22 05:44:45 +08:00
|
|
|
* Returns the index of what would be the nth item in the array
|
2007-03-08 04:03:40 +08:00
|
|
|
* if the members were sorted, without actually sorting.
|
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
inT32 choose_nth_item(inT32 index, void *array, inT32 count, size_t size,
|
|
|
|
int (*compar)(const void*, const void*)) {
|
|
|
|
int result; // of compar
|
|
|
|
inT32 next_sample; // next one to do
|
|
|
|
inT32 next_lesser; // space for new
|
|
|
|
inT32 prev_greater; // last one saved
|
|
|
|
inT32 equal_count; // no of equal ones
|
|
|
|
inT32 pivot; // proposed median
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
if (count <= 1)
|
|
|
|
return 0;
|
|
|
|
if (count == 2) {
|
|
|
|
if (compar (array, (char *) array + size) < 0) {
|
|
|
|
return index >= 1 ? 1 : 0;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return index >= 1 ? 0 : 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (index < 0)
|
2011-03-22 05:44:45 +08:00
|
|
|
index = 0; // ensure legal
|
2007-03-08 04:03:40 +08:00
|
|
|
else if (index >= count)
|
|
|
|
index = count - 1;
|
2008-04-22 08:41:37 +08:00
|
|
|
pivot = (inT32) (rand () % count);
|
2007-03-08 04:03:40 +08:00
|
|
|
swap_entries (array, size, pivot, 0);
|
|
|
|
next_lesser = 0;
|
|
|
|
prev_greater = count;
|
|
|
|
equal_count = 1;
|
|
|
|
for (next_sample = 1; next_sample < prev_greater;) {
|
|
|
|
result =
|
|
|
|
compar ((char *) array + size * next_sample,
|
|
|
|
(char *) array + size * next_lesser);
|
|
|
|
if (result < 0) {
|
|
|
|
swap_entries (array, size, next_lesser++, next_sample++);
|
2011-03-22 05:44:45 +08:00
|
|
|
// shuffle
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else if (result > 0) {
|
|
|
|
prev_greater--;
|
2008-04-22 08:41:37 +08:00
|
|
|
swap_entries(array, size, prev_greater, next_sample);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
equal_count++;
|
|
|
|
next_sample++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (index < next_lesser)
|
|
|
|
return choose_nth_item (index, array, next_lesser, size, compar);
|
|
|
|
else if (index < prev_greater)
|
2011-03-22 05:44:45 +08:00
|
|
|
return next_lesser; // in equal bracket
|
2007-03-08 04:03:40 +08:00
|
|
|
else
|
|
|
|
return choose_nth_item (index - prev_greater,
|
|
|
|
(char *) array + size * prev_greater,
|
|
|
|
count - prev_greater, size,
|
|
|
|
compar) + prev_greater;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* swap_entries
|
|
|
|
*
|
2011-03-22 05:44:45 +08:00
|
|
|
* Swap 2 entries of arbitrary size in-place in a table.
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
2011-03-22 05:44:45 +08:00
|
|
|
void swap_entries(void *array, // array of entries
|
|
|
|
size_t size, // size of entry
|
|
|
|
inT32 index1, // entries to swap
|
2008-04-22 08:41:37 +08:00
|
|
|
inT32 index2) {
|
2007-03-08 04:03:40 +08:00
|
|
|
char tmp;
|
2011-03-22 05:44:45 +08:00
|
|
|
char *ptr1; // to entries
|
2007-03-08 04:03:40 +08:00
|
|
|
char *ptr2;
|
2011-03-22 05:44:45 +08:00
|
|
|
size_t count; // of bytes
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2017-05-11 06:40:31 +08:00
|
|
|
ptr1 = static_cast<char*>(array) + index1 * size;
|
|
|
|
ptr2 = static_cast<char*>(array) + index2 * size;
|
2007-03-08 04:03:40 +08:00
|
|
|
for (count = 0; count < size; count++) {
|
|
|
|
tmp = *ptr1;
|
|
|
|
*ptr1++ = *ptr2;
|
2011-03-22 05:44:45 +08:00
|
|
|
*ptr2++ = tmp; // tedious!
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
}
|