PROBI  1.0
 All Classes Functions
FastCoreset.hpp
1 #ifndef FASTCORESET_HPP
2 #define FASTCORESET_HPP
3 
4 #include <vector>
5 #include <unordered_map>
6 #include <iterator>
7 
8 #include "Metric.hpp"
9 #include "Sampling.hpp"
10 #include "Weiszfeld.hpp"
11 #include "CenterOfGravity.hpp"
12 #include "KumarMedian.hpp"
13 #include "LloydMedian.hpp"
14 #include "PKMedian.hpp"
15 #include "ProbabilisticPoint.hpp"
16 
23 {
24 private:
25  Weiszfeld weiszfeld;
26  CenterOfGravity centerOfGravity;
27  KumarMedian kumar;
28  LloydMedian lloyd;
29  PKMedian pkmedian;
30  Metric<Point>* metric;
31  int k;
32  int weiszfeldMedianIterations;
33  int kumarMedianIterations;
34  int maxLloydClusteringIterations;
35  int allSamplesSize;
36 public:
37  FastCoreset(std::function<Metric<Point>*() > createMetric, std::function<Norm<Point>*() > createNorm);
38  virtual ~FastCoreset();
39 
44  void setK(int k);
45 
50  int getK() const;
51 
56  void setWeiszfeldMedianIterations(int weiszfeldMedianIterations);
57 
62  int getWeiszfeldMedianIterations() const;
63 
68  void setMaxLloydClusteringIterations(int maxLloydClusteringIterations);
69 
75 
80  void setKumarMedianIterations(int kumarMedianIterations);
81 
86  int getKumarMedianIterations() const;
87 
92  void setAllSamplesSize(int allSamplesSize);
93 
98  int getAllSamplesSize() const;
99 
108  template<typename Iterator1, typename Iterator2>
109  void computeCoreset(Iterator1 inputBegin, Iterator1 inputEnd, Iterator2 output, size_t n = 0);
110 
111 private:
112  template<typename RandomAccessIterator>
113  std::unique_ptr<std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> >> partition(RandomAccessIterator begin, RandomAccessIterator end, std::vector<WeightedPoint>& medians, std::vector<Point>& centers, double radius, size_t n = 0);
114 
115  template<typename RandomAccessIterator>
116  std::unique_ptr<std::vector<std::pair<size_t, double >> > sampleCoreset(RandomAccessIterator begin, RandomAccessIterator end, std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> > const & partitions, size_t sampleSize);
117 
118  template<typename RandomAccessIterator>
119  ProbabilisticPoint const & input(RandomAccessIterator begin, size_t index)
120  {
121  return *(begin + index);
122  }
123 };
124 
125 template<typename RandomAccessIterator1, typename Iterator2>
126 void FastCoreset::computeCoreset(RandomAccessIterator1 inputBegin, RandomAccessIterator1 inputEnd, Iterator2 output, size_t n)
127 {
128  if (n == 0)
129  for (RandomAccessIterator1 it = inputBegin; it != inputEnd; ++it)
130  ++n;
131  double W = 0;
132 
133  // 1. Compute 1-median of each (probabilistic) input point
134  std::vector<WeightedPoint> medians;
135  medians.reserve(n);
136  for (RandomAccessIterator1 it = inputBegin; it != inputEnd; ++it)
137  {
138  Point p;
139 #if KMEANS
140  p = centerOfGravity.cog(it->cbegin(), it->cend());
141 #else
142  try
143  {
144  p = weiszfeld.approximateOneMedian(it->cbegin(), it->cend(), weiszfeldMedianIterations);
145  }
146  catch (Weiszfeld::IterationFailed err)
147  {
148  p = kumar.approximateOneMedianRounds(it->cbegin(), it->cend(), 0.9999999999, kumarMedianIterations);
149  }
150 #endif
151  medians.push_back(p);
152  medians[medians.size() - 1].setWeight(it->getWeight());
153  }
154 
155  // 2. Cluster 1-medians
156  std::vector<Point> centers;
157  centers.reserve(k);
158  lloyd.computeCenterSet(medians.begin(), medians.end(), std::back_inserter(centers), k, maxLloydClusteringIterations, n);
159  double cost = pkmedian.weightedCost(inputBegin, inputEnd, centers.begin(), centers.end());
160 
161  // 3. Partitioning and sampling
162  double radius = cost / W; //TODO value of radius
163  std::unique_ptr < std::vector < std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> >> partitions =
164  partition(inputBegin, inputEnd, medians, centers, radius, n);
165  size_t usedRings = 0;
166  for (size_t l = 0; l < partitions->size(); ++l)
167  {
168  std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> const & L = (*partitions)[l];
169  for (size_t h = 0; h < L.size(); ++h)
170  {
171  std::vector < std::vector < std::pair<size_t, size_t >> > const & H = L[h];
172  for (size_t a = 0; a < H.size(); ++a)
173  {
174  std::vector < std::pair<size_t, size_t >> const & P = H[a];
175  if (P.size() > 0)
176  ++usedRings;
177  }
178  }
179  }
180  int sampleSize = allSamplesSize / usedRings;
181  if(sampleSize == 0)
182  sampleSize = 1;
183  std::unique_ptr < std::vector < std::pair<size_t, double >> > samples =
184  sampleCoreset(inputBegin, inputEnd, *partitions, sampleSize);
185 
186  // Write coreset to output iterator
187  for (size_t i = 0; i < samples->size(); ++i)
188  {
189  ProbabilisticPoint pp(*(inputBegin + i));
190  pp.setWeight((*samples)[i].second);
191  *output = pp;
192  ++output;
193  }
194 }
195 
196 template<typename RandomAccessIterator>
197 std::unique_ptr<std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> >> FastCoreset::partition(RandomAccessIterator begin, RandomAccessIterator end, std::vector<WeightedPoint>& medians, std::vector<Point>& centers, double radius, size_t n)
198 {
199  if (n == 0)
200  for (RandomAccessIterator it = begin; it != end; ++it)
201  ++n;
202 
203  std::unique_ptr < std::vector < std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> >> partitions(new std::vector < std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> >());
204 
205  for (size_t i = 0; i < medians.size(); ++i)
206  {
207  // Determine l
208  size_t l = 0;
209  double dist = 0;
210  {
211  for (size_t j = 0; j < centers.size(); ++j)
212  {
213  double tmpDist = metric->distance(medians[i], centers[j]);
214  if (tmpDist < dist || j == 0)
215  {
216  dist = tmpDist;
217  l = j;
218  }
219  }
220  }
221 
222  // Determine h
223  size_t h = 0;
224  {
225  if (dist > radius)
226  h = std::ceil(std::abs(std::log2(dist / radius)));
227  }
228 
229  // Determine a
230  size_t a = 0;
231  {
232  double width = 0;
233  ProbabilisticPoint const & v(input(begin, i));
234  for (auto it = v.cbegin(); it != v.cend(); ++it)
235  width += it->getWeight() * metric->distance(*it, medians[i]);
236  width /= v.getRealizationProbability();
237  if (width > radius)
238  a = std::ceil(std::abs(std::log2(width / radius)));
239  }
240 
241  // Allocate space and push index to ring (l,h,a)
242  if ((*partitions).size() < l + 1)
243  (*partitions).resize(l + 1);
244  if ((*partitions)[l].size() < h + 1)
245  (*partitions)[l].resize(h + 1);
246  if ((*partitions)[l][h].size() < a + 1)
247  (*partitions)[l][h].resize(a + 1);
248  (*partitions)[l][h][a].push_back(std::pair<size_t, size_t>(i, l));
249  }
250 
251  return partitions;
252 }
253 
254 template<typename RandomAccessIterator>
255 std::unique_ptr<std::vector<std::pair<size_t, double >> > FastCoreset::sampleCoreset(RandomAccessIterator begin, RandomAccessIterator end, std::vector<std::vector<std::vector<std::vector<std::pair<size_t, size_t >> >> > const & partitions, size_t sampleSize)
256 {
257  std::unordered_map<size_t, size_t> sampledPoints;
258  std::unique_ptr < std::vector < std::pair<size_t, double >> > coreset(new std::vector < std::pair<size_t, double >>);
259  for (size_t l = 0; l < partitions.size(); ++l)
260  {
261  std::vector < std::vector < std::vector < std::pair<size_t, size_t >> >> const & L = partitions[l];
262  for (size_t h = 0; h < L.size(); ++h)
263  {
264  std::vector < std::vector < std::pair<size_t, size_t >> > const & H = L[h];
265  for (size_t a = 0; a < H.size(); ++a)
266  {
267  std::vector < std::pair<size_t, size_t >> const & P = H[a];
268 
269  // Sampling probabilities
270  double sum = 0;
271  for (size_t p = 0; p < P.size(); ++p)
272  sum += input(begin, P[p].first).getWeight() * input(begin, P[p].first).getRealizationProbability();
273  std::vector<double> prob(P.size());
274  for (size_t p = 0; p < P.size(); ++p)
275  prob[p] = input(begin, P[p].first).getWeight() * input(begin, P[p].first).getRealizationProbability();
276 
277  // Do sampling
278  std::unique_ptr < std::vector < std::pair<size_t, size_t >> > sample(Sampling::sampleWithReplacement < std::vector < std::pair<size_t, size_t >> ::const_iterator, std::pair<size_t, size_t >> (P.begin(), P.end(), prob, sampleSize));
279 
280  // Return point and weight
281  for (size_t s = 0; s < sample->size(); ++s)
282  {
283  size_t index = (*sample)[s].first;
284  double weight = sum / (input(begin, (*sample)[s].first).getRealizationProbability() * sampleSize);
285  // Check if point was sampled before
286  if (sampledPoints.count(index) > 0)
287  {
288  (*coreset)[sampledPoints[index]].second += weight;
289  }
290  else
291  {
292  sampledPoints.insert(std::pair<size_t, size_t>(index, coreset->size()));
293  coreset->push_back(std::pair<size_t, double>(index, weight));
294  }
295  }
296  }
297  }
298  }
299  return coreset;
300 }
301 
302 #endif /* FASTCORESET_HPP */
303