/* Reservoir Sampling * * This blog post is a valid C99 program - you can compile it and run it, and * try it out: * * cc -x c -o reservoir post.txt * seq 1 1000 | ./reservoir 15 * * This is one of those neat little algorithms that comes up surprisingly often. * The problem it solves is straightforward: you have an input list L of n * elements, and you want to choose k of them uniformly at random. One obvious * approach is to randomly shuffle L, then take the first k elements afterwards, * but to do that you have to have all of L available in memory at the same * time. What if L is too big for that, or doesn't all exist at the same time - * like if you are trying to randomly sample events passing through an event * logging system or something? * * Reservoir sampling solves that problem. The basic idea is very simple: you * keep a "reservoir" of up to k items, and whenever you see a new item, if the * reservoir is not full, you add it. If the reservoir is full, you add it to * the reservoir with probability k / n, replacing one of the existing items * uniformly at random. A common way of implementing this is that, for each * element L[i] of the list you see, if there are already k items in the * reservoir R[0] ... R[k-1], you choose r = random-in({0, 1, ..., i}), and if r * < k, replace R[r] with L[i]. * * A proof that this algorithm works is not that difficult. You can do it by * induction on the number of elements considered so far, showing that after * considering each element i, the probability that each of L[0] ... L[i] is * included in the reservoir is k / i. I won't do it here but you can work it * out yourself if you want - exercise to the reader, &c :). * * Here's an implementation in C as well, although there's really not that much * code here, because the algorithm is so simple. */ #include #include #include #include #include #include /* Random number generation is easy to get almost right and irritatingly hard to * get actually right. In particular, it's easy to accidentally introduce small * biases by doing things the obvious way. * * Notational aside: below we'll use [] and () to describe ranges of numbers. * Conventionally [] are for "closed" ends and () are for "open" ends, so for * example: [1,5] means {1, 2, 3, 4, 5}, [1,5) means {1, 2, 3, 4}, and (1,5) * means {2, 3, 4}. This is also useful when working with real numbers: * (1.0,2.0) includes all reals between 1.0 and 2.0 but not 1.0 or 2.0 * themselves, while [1.0,2.0] includes the endpoints too. * * The naive approach to generating a random number in [0, n) is to take * random() % n, which works well as long as n evenly divides the size of the * range of random(). If it doesn't, biases creep in at the top of the range. * * Here's an illustrated example: let's suppose that we had a 4-bit random * number generator randnibble() which produced a value in [0, 16) with equal * probability. If we took the output of that generator modulo 4, we'd find that * the possible outputs are: * * 0 -> 0 1 -> 1 2 -> 2 3 -> 3 * 4 -> 0 5 -> 1 6 -> 2 7 -> 3 * 8 -> 0 9 -> 1 10 -> 2 11 -> 3 * 12 -> 0 13 -> 1 14 -> 2 15 -> 3 * * so we can see that, if the generator's result is indeed uniformly distributed * between [0, 16), then the output is one of { 0, 1, 2, 3 } with equal * probability. * * However, if we took the output modulo 3 instead, we'd get a bias: * * 0 -> 0 1 -> 1 2 -> 2 * 3 -> 0 4 -> 1 5 -> 2 * 6 -> 0 7 -> 1 8 -> 2 * 9 -> 0 10 -> 1 11 -> 2 * 12 -> 0 13 -> 1 14 -> 2 * 15 -> 0 * * So randnibble() % 3 produces 0 with probability 6/16, 1 with probability * 5/16, and 2 with probability 5/16. * * If we do the same thing with, say, random() (which produces a 64-bit random * value on my system) and take its result modulo 3, we'd find that there are * (2**64 / 3) random() outputs that yield 1, (2**64 / 3) random() outputs that * yield 2, and (2**64 / 3) + 1 random() outputs that yield 0. Oops! * * There are a few ways around that, but one of the most straightforward is to * use our original RNG to produce a second RNG which has a range that *is* * divisible by the desired value. For example, we could do this: * * do { r = random() } while (r >= range); * * which would effectively keep producing random numbers until one of them was * inside the range. Since each value of random() has equal probability, each * value <= range would also have equal probability. The downside of doing this * is that it could take quite a while to hit the desired value (!). We can help * with that by expanding the range we're willing to accept from the loop; as * long as we pick a search range which is evenly divisible by the target range, * we're good to go. * * To do that, we can compute the largest multiple of the target range which is * less than or equal to the rng's native range, and use that as our search * range. For example, if we were using randnibble() and wanted a number modulo * 3, we could make our search range 15. That would give us this frequency * table: * * 0 -> 0 1 -> 1 2 -> 2 * 3 -> 0 4 -> 1 5 -> 2 * 6 -> 0 7 -> 1 8 -> 2 * 9 -> 0 10 -> 1 11 -> 2 * 12 -> 0 13 -> 1 14 -> 2 * * which is uniformly distributed. All we'd need to do would be this: * * do { r = randnibble() } while (r >= 15); * return r % 3; * * There *is* an annoying catch to this though which is that our RNG is no * longer guaranteed to actually terminate! It will terminate probabilistically, * but there are certain RNG outputs which will never terminate. That's a bit of * a bummer, but that's the price you pay for better randomness I guess. * * So, that in mind, here's the first primitive we need: choosing an index in * [0, n) uniformly at random. We'll use exactly the technique described above: */ static size_t index_upto(size_t n) { size_t cap = (LONG_MAX / n) * n; unsigned long rv; do rv = random(); while (rv >= cap); return rv % n; } struct reservoir { /* The reservoir of values itself - this has filled slots, numbered 0 * ... filled - 1. This is R[0] ... R[k - 1] in the description given * above. */ int64_t *vals; /* This is k in the algorithm - the capacity of the reservoir. */ size_t cap; /* And this is n in the algorithm - the total number of elements of L * considered so far. */ size_t seen; }; void reservoir_init(struct reservoir *rs, size_t cap) { rs->cap = cap; rs->seen = 0; rs->vals = calloc(cap, sizeof *rs->vals); if (!rs->vals) errx(1, "out of memory :("); } void reservoir_sample(struct reservoir *rs, int64_t v) { size_t r; rs->seen++; /* Reservoir's not full yet - put v into it and return. */ if (rs->seen <= rs->cap) { rs->vals[rs->seen - 1] = v; return; } /* Reservoir is full - pick a random index in [0, seen). If it's inside * the reservoir, replace that element. */ r = index_upto(rs->seen); if (r <= rs->cap) rs->vals[r - 1] = v; } size_t min(size_t a, size_t b) { return a < b ? a : b; } /* All we do here is: allocate the reservoir with the size given on the command * line, read integers from stdin and stick them into the reservoir randomly, * then print the reservoir afterwards. */ int main(int argc, char *argv[]) { char line[64]; struct reservoir rs; size_t i; if (argc < 2) errx(1, "usage: %s ", argv[0]); srandom(time(NULL)); reservoir_init(&rs, strtoul(argv[1], NULL, 10)); while (fgets(line, sizeof(line), stdin)) reservoir_sample(&rs, strtol(line, NULL, 10)); for (i = 0; i < min(rs.seen, rs.cap); i++) printf("%ld\n", rs.vals[i]); return 0; } /* $#t Reservoir sampling * $#s How to choose a random subset of a large set efficiently * $#o c, computer-science * $#u d23088e0-e9f3-60a2-ab15-12ef7c4ae175 */