1 #ifndef FASTCORESET_HPP
2 #define FASTCORESET_HPP
5 #include <unordered_map>
9 #include "Sampling.hpp"
10 #include "Weiszfeld.hpp"
11 #include "CenterOfGravity.hpp"
12 #include "KumarMedian.hpp"
13 #include "LloydMedian.hpp"
14 #include "PKMedian.hpp"
15 #include "ProbabilisticPoint.hpp"
32 int weiszfeldMedianIterations;
33 int kumarMedianIterations;
34 int maxLloydClusteringIterations;
108 template<
typename Iterator1,
typename Iterator2>
109 void computeCoreset(Iterator1 inputBegin, Iterator1 inputEnd, Iterator2 output,
size_t n = 0);
112 template<
typename RandomAccessIterator>
113 std::unique_ptr<std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> >> partition(RandomAccessIterator begin, RandomAccessIterator end, std::vector<WeightedPoint>& medians, std::vector<Point>& centers,
double radius,
size_t n = 0);
115 template<
typename RandomAccessIterator>
116 std::unique_ptr<std::vector<std::pair<size_t, double >> > sampleCoreset(RandomAccessIterator begin, RandomAccessIterator end, std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> >
const & partitions,
size_t sampleSize);
118 template<
typename RandomAccessIterator>
121 return *(begin + index);
125 template<
typename RandomAccessIterator1,
typename Iterator2>
129 for (RandomAccessIterator1 it = inputBegin; it != inputEnd; ++it)
134 std::vector<WeightedPoint> medians;
136 for (RandomAccessIterator1 it = inputBegin; it != inputEnd; ++it)
140 p = centerOfGravity.
cog(it->cbegin(), it->cend());
151 medians.push_back(p);
152 medians[medians.size() - 1].setWeight(it->getWeight());
156 std::vector<Point> centers;
158 lloyd.
computeCenterSet(medians.begin(), medians.end(), std::back_inserter(centers), k, maxLloydClusteringIterations, n);
159 double cost = pkmedian.
weightedCost(inputBegin, inputEnd, centers.begin(), centers.end());
162 double radius = cost / W;
163 std::unique_ptr < std::vector < std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> >> partitions =
164 partition(inputBegin, inputEnd, medians, centers, radius, n);
165 size_t usedRings = 0;
166 for (
size_t l = 0; l < partitions->size(); ++l)
168 std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >>
const & L = (*partitions)[l];
169 for (
size_t h = 0; h < L.size(); ++h)
171 std::vector < std::vector < std::pair<size_t, size_t >> >
const & H = L[h];
172 for (
size_t a = 0; a < H.size(); ++a)
174 std::vector < std::pair<size_t, size_t >>
const & P = H[a];
180 int sampleSize = allSamplesSize / usedRings;
183 std::unique_ptr < std::vector < std::pair<size_t, double >> > samples =
184 sampleCoreset(inputBegin, inputEnd, *partitions, sampleSize);
187 for (
size_t i = 0; i < samples->size(); ++i)
190 pp.setWeight((*samples)[i].second);
196 template<
typename RandomAccessIterator>
197 std::unique_ptr<std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> >> FastCoreset::partition(RandomAccessIterator begin, RandomAccessIterator end, std::vector<WeightedPoint>& medians, std::vector<Point>& centers,
double radius,
size_t n)
200 for (RandomAccessIterator it = begin; it != end; ++it)
203 std::unique_ptr < std::vector < std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> >> partitions(
new std::vector < std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> >());
205 for (
size_t i = 0; i < medians.size(); ++i)
211 for (
size_t j = 0; j < centers.size(); ++j)
213 double tmpDist = metric->distance(medians[i], centers[j]);
214 if (tmpDist < dist || j == 0)
226 h = std::ceil(std::abs(std::log2(dist / radius)));
234 for (
auto it = v.cbegin(); it != v.cend(); ++it)
235 width += it->getWeight() * metric->distance(*it, medians[i]);
236 width /= v.getRealizationProbability();
238 a = std::ceil(std::abs(std::log2(width / radius)));
242 if ((*partitions).size() < l + 1)
243 (*partitions).resize(l + 1);
244 if ((*partitions)[l].size() < h + 1)
245 (*partitions)[l].resize(h + 1);
246 if ((*partitions)[l][h].size() < a + 1)
247 (*partitions)[l][h].resize(a + 1);
248 (*partitions)[l][h][a].push_back(std::pair<size_t, size_t>(i, l));
254 template<
typename RandomAccessIterator>
255 std::unique_ptr<std::vector<std::pair<size_t, double >> > FastCoreset::sampleCoreset(RandomAccessIterator begin, RandomAccessIterator end, std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> >
const & partitions,
size_t sampleSize)
257 std::unordered_map<size_t, size_t> sampledPoints;
258 std::unique_ptr < std::vector < std::pair<size_t, double >> > coreset(
new std::vector < std::pair<size_t, double >>);
259 for (
size_t l = 0; l < partitions.size(); ++l)
261 std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >>
const & L = partitions[l];
262 for (
size_t h = 0; h < L.size(); ++h)
264 std::vector < std::vector < std::pair<size_t, size_t >> >
const & H = L[h];
265 for (
size_t a = 0; a < H.size(); ++a)
267 std::vector < std::pair<size_t, size_t >>
const & P = H[a];
271 for (
size_t p = 0; p < P.size(); ++p)
272 sum += input(begin, P[p].first).getWeight() * input(begin, P[p].first).getRealizationProbability();
273 std::vector<double> prob(P.size());
274 for (
size_t p = 0; p < P.size(); ++p)
275 prob[p] = input(begin, P[p].first).getWeight() * input(begin, P[p].first).getRealizationProbability();
278 std::unique_ptr < std::vector < std::pair<size_t, size_t >> > sample(
Sampling::sampleWithReplacement < std::vector < std::pair<size_t, size_t >> ::const_iterator, std::pair<size_t, size_t >> (P.begin(), P.end(), prob, sampleSize));
281 for (
size_t s = 0; s < sample->size(); ++s)
283 size_t index = (*sample)[s].first;
284 double weight = sum / (input(begin, (*sample)[s].first).getRealizationProbability() * sampleSize);
286 if (sampledPoints.count(index) > 0)
288 (*coreset)[sampledPoints[index]].second += weight;
292 sampledPoints.insert(std::pair<size_t, size_t>(index, coreset->size()));
293 coreset->push_back(std::pair<size_t, double>(index, weight));