PROBI  1.0
 All Classes Functions
Sampling.hpp
1 #ifndef SAMPLING_H
2 #define SAMPLING_H
3 
4 #include <vector>
5 #include <random>
6 #include <memory>
7 #include <map>
8 #include <set>
9 #include <iostream>
10 #include "Helper.hpp"
11 #include "Randomness.hpp"
12 
16 class Sampling
17 {
18 public:
19 
25  template<typename InputIterator, typename ResultType>
26  static std::unique_ptr<std::vector<ResultType>> sampleWithoutReplacement(InputIterator first, InputIterator last, size_t sizeOfSample);
27 
33  template<typename RandomAccessIterator, typename ResultType>
34  static std::unique_ptr<std::vector<ResultType>> sampleWithoutReplacementFast(RandomAccessIterator first, RandomAccessIterator last, size_t sizeOfSample);
35 
41  template<typename InputIterator, typename ResultType>
42  static std::unique_ptr<std::vector<ResultType>> sampleWithReplacement(InputIterator first, InputIterator last, std::vector<double> & weights, size_t sizeOfSample);
43 
49  template<typename InputIterator, typename ResultType>
50  static std::unique_ptr<std::vector<ResultType>> sampleWithReplacement(InputIterator first, InputIterator last, size_t sizeOfSet, size_t sizeOfSample);
51 };
52 
53 template<typename InputIterator, typename ResultType>
54 std::unique_ptr<std::vector<ResultType>> Sampling::sampleWithoutReplacement(InputIterator first, InputIterator last, size_t sizeOfSample)
55 {
56  std::unique_ptr<std::vector<ResultType>> buckets(new std::vector<ResultType>(sizeOfSample));
57  std::mt19937 * gen = Randomness::getMT19937();
58  std::uniform_real_distribution<double> sampleDice(0, 1);
59  std::uniform_int_distribution<size_t> bucketDice(0, sizeOfSample-1);
60 
61  // Take first (sizeOfSample) elements as initial sample
62  size_t numberOfElements = 0;
63  for(auto it = buckets->begin(); it != buckets->end() && first != last; ++it)
64  {
65  *it = *(first++);
66  ++numberOfElements;
67  }
68 
69  // Less elements than buckets?
70  if(sizeOfSample > numberOfElements)
71  return std::unique_ptr<std::vector<ResultType>>(new std::vector<ResultType>(buckets->begin(), buckets->begin()+numberOfElements));
72 
73  // Sample from remaining (n+1)...(last) elements
74  size_t index = sizeOfSample+1;
75  while(first != last)
76  {
77  if(sampleDice(*gen) < double(sizeOfSample)/double(index))
78  (*buckets)[bucketDice(*gen)] = *first;
79  ++first;
80  ++index;
81  }
82 
83  return buckets;
84 }
85 
86 template<typename RandomAccessIterator, typename ResultType>
87 std::unique_ptr<std::vector<ResultType>> Sampling::sampleWithoutReplacementFast(RandomAccessIterator first, RandomAccessIterator last, size_t sizeOfSample)
88 {
89  unsigned long long n = last-first;
90 
91  if(n <= sizeOfSample)
92  return std::unique_ptr<std::vector<ResultType>>(new std::vector<ResultType>(first, last));
93 
94  std::mt19937 * gen = Randomness::getMT19937();
95  std::uniform_real_distribution<double> sampleDice(0, n-1);
96  std::set<size_t> selected;
97  std::vector<size_t> indices;
98  indices.reserve(sizeOfSample);
99  while(selected.size() < sizeOfSample)
100  {
101  size_t element = sampleDice(*gen);
102  if(selected.count(element) == 0)
103  {
104  selected.insert(element);
105  indices.push_back(element);
106  }
107  }
108 
109  std::unique_ptr<std::vector<ResultType>> buckets(new std::vector<ResultType>());
110  buckets->reserve(sizeOfSample);
111 
112  for(auto it = indices.begin(); it != indices.end(); ++it)
113  buckets->push_back(*(first + *it));
114 
115  return buckets;
116 }
117 
118 template<typename InputIterator, typename ResultType>
119 std::unique_ptr<std::vector<ResultType>> Sampling::sampleWithReplacement(InputIterator first, InputIterator last, std::vector<double> & weights, size_t sizeOfSample)
120 {
121  std::mt19937 * gen = Randomness::getMT19937();
122  std::discrete_distribution<size_t> d(weights.begin(), weights.end());
123 
124  // Sample
125  std::multimap<size_t, size_t> samples;
126  for(size_t i = 0; i < sizeOfSample; ++i)
127  samples.insert(std::pair<size_t,size_t>(d(*gen), i));
128 
129  // Construct sample set
130  std::unique_ptr<std::vector<ResultType>> buckets(new std::vector<ResultType>(sizeOfSample));
131  size_t index = 0;
132  for(auto it = first; it != last; ++it)
133  {
134  auto buck = samples.equal_range(index);
135  for(auto itbuck = buck.first; itbuck != buck.second; ++itbuck)
136  (*buckets)[itbuck->second] = *it;
137  ++index;
138  }
139 
140  return buckets;
141 }
142 
143 template<typename InputIterator, typename ResultType>
144 std::unique_ptr<std::vector<ResultType>> Sampling::sampleWithReplacement(InputIterator first, InputIterator last, size_t sizeOfSet, size_t sizeOfSample)
145 {
146  std::mt19937 * gen = Randomness::getMT19937();
147  std::uniform_int_distribution<size_t> d(0, sizeOfSet-1);
148 
149  // Sample
150  std::multimap<size_t, size_t> samples;
151  for(size_t i = 0; i < sizeOfSample; ++i)
152  samples.insert(std::pair<size_t,size_t>(d(*gen), i));
153 
154  // Construct sample set
155  std::unique_ptr<std::vector<ResultType>> buckets(new std::vector<ResultType>(sizeOfSample));
156  size_t index = 0;
157  for(auto it = first; it != last; ++it)
158  {
159  auto buck = samples.equal_range(index);
160  for(auto itbuck = buck.first; itbuck != buck.second; ++itbuck)
161  (*buckets)[itbuck->second] = *it;
162  ++index;
163  }
164 
165  return buckets;
166 }
167 
168 #endif