/* Reservoir Sampling
 *
 * This blog post is a valid C99 program - you can compile it and run it, and
 * try it out:
 *
 *   cc -x c -o reservoir post.txt
 *   seq 1 1000 | ./reservoir 15
 *
 * This is one of those neat little algorithms that comes up surprisingly often.
 * The problem it solves is straightforward: you have an input list L of n
 * elements, and you want to choose k of them uniformly at random. One obvious
 * approach is to randomly shuffle L, then take the first k elements afterwards,
 * but to do that you have to have all of L available in memory at the same
 * time. What if L is too big for that, or doesn't all exist at the same time -
 * like if you are trying to randomly sample events passing through an event
 * logging system or something?
 *
 * Reservoir sampling solves that problem. The basic idea is very simple: you
 * keep a "reservoir" of up to k items, and whenever you see a new item, if the
 * reservoir is not full, you add it. If the reservoir is full, you add it to
 * the reservoir with probability k / n, replacing one of the existing items
 * uniformly at random. A common way of implementing this is that, for each
 * element L[i] of the list you see, if there are already k items in the
 * reservoir R[0] ... R[k-1], you choose r = random-in({0, 1, ..., i}), and if r
 * < k, replace R[r] with L[i].
 *
 * A proof that this algorithm works is not that difficult. You can do it by
 * induction on the number of elements considered so far, showing that after
 * considering each element i, the probability that each of L[0] ... L[i] is
 * included in the reservoir is k / i. I won't do it here but you can work it
 * out yourself if you want - exercise to the reader, &c :).
 *
 * Here's an implementation in C as well, although there's really not that much
 * code here, because the algorithm is so simple.
 */

#include <err.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Random number generation is easy to get almost right and irritatingly hard to
 * get actually right. In particular, it's easy to accidentally introduce small
 * biases by doing things the obvious way.
 *
 * Notational aside: below we'll use [] and () to describe ranges of numbers.
 * Conventionally [] are for "closed" ends and () are for "open" ends, so for
 * example: [1,5] means {1, 2, 3, 4, 5}, [1,5) means {1, 2, 3, 4}, and (1,5)
 * means {2, 3, 4}. This is also useful when working with real numbers:
 * (1.0,2.0) includes all reals between 1.0 and 2.0 but not 1.0 or 2.0
 * themselves, while [1.0,2.0] includes the endpoints too.
 *
 * The naive approach to generating a random number in [0, n) is to take
 * random() % n, which works well as long as n evenly divides the size of the
 * range of random(). If it doesn't, biases creep in at the top of the range.
 *
 * Here's an illustrated example: let's suppose that we had a 4-bit random
 * number generator randnibble() which produced a value in [0, 16) with equal
 * probability. If we took the output of that generator modulo 4, we'd find that
 * the possible outputs are:
 *
 *   0 -> 0   1 -> 1   2 -> 2   3 -> 3
 *   4 -> 0   5 -> 1   6 -> 2   7 -> 3
 *   8 -> 0   9 -> 1  10 -> 2  11 -> 3
 *  12 -> 0  13 -> 1  14 -> 2  15 -> 3
 *
 * so we can see that, if the generator's result is indeed uniformly distributed
 * between [0, 16), then the output is one of { 0, 1, 2, 3 } with equal
 * probability.
 *
 * However, if we took the output modulo 3 instead, we'd get a bias:
 *
 *   0 -> 0   1 -> 1   2 -> 2
 *   3 -> 0   4 -> 1   5 -> 2
 *   6 -> 0   7 -> 1   8 -> 2
 *   9 -> 0  10 -> 1  11 -> 2
 *  12 -> 0  13 -> 1  14 -> 2
 *  15 -> 0
 *
 * So randnibble() % 3 produces 0 with probability 6/16, 1 with probability
 * 5/16, and 2 with probability 5/16.
 *
 * If we do the same thing with, say, random() (which produces a 64-bit random
 * value on my system) and take its result modulo 3, we'd find that there are
 * (2**64 / 3) random() outputs that yield 1, (2**64 / 3) random() outputs that
 * yield 2, and (2**64 / 3) + 1 random() outputs that yield 0. Oops!
 *
 * There are a few ways around that, but one of the most straightforward is to
 * use our original RNG to produce a second RNG which has a range that *is*
 * divisible by the desired value. For example, we could do this:
 *
 *   do { r = random() } while (r >= range);
 *
 * which would effectively keep producing random numbers until one of them was
 * inside the range. Since each value of random() has equal probability, each
 * value <= range would also have equal probability. The downside of doing this
 * is that it could take quite a while to hit the desired value (!). We can help
 * with that by expanding the range we're willing to accept from the loop; as
 * long as we pick a search range which is evenly divisible by the target range,
 * we're good to go.
 *
 * To do that, we can compute the largest multiple of the target range which is
 * less than or equal to the rng's native range, and use that as our search
 * range. For example, if we were using randnibble() and wanted a number modulo
 * 3, we could make our search range 15. That would give us this frequency
 * table:
 *
 *   0 -> 0   1 -> 1   2 -> 2
 *   3 -> 0   4 -> 1   5 -> 2
 *   6 -> 0   7 -> 1   8 -> 2
 *   9 -> 0  10 -> 1  11 -> 2
 *  12 -> 0  13 -> 1  14 -> 2
 *
 * which is uniformly distributed. All we'd need to do would be this:
 *
 *   do { r = randnibble() } while (r >= 15);
 *   return r % 3;
 *
 * There *is* an annoying catch to this though which is that our RNG is no
 * longer guaranteed to actually terminate! It will terminate probabilistically,
 * but there are certain RNG outputs which will never terminate. That's a bit of
 * a bummer, but that's the price you pay for better randomness I guess.
 *
 * So, that in mind, here's the first primitive we need: choosing an index in
 * [0, n) uniformly at random. We'll use exactly the technique described above:
 */
static size_t index_upto(size_t n) {
	size_t cap = (LONG_MAX / n) * n;
	unsigned long rv;

	do
		rv = random();
	while (rv >= cap);

	return rv % n;
}

struct reservoir {
	/* The reservoir of values itself - this has filled slots, numbered 0
	 * ... filled - 1. This is R[0] ... R[k - 1] in the description given
	 * above. */
	int64_t *vals;

	/* This is k in the algorithm - the capacity of the reservoir. */
	size_t cap;

	/* And this is n in the algorithm - the total number of elements of L
	 * considered so far. */
	size_t seen;
};

void reservoir_init(struct reservoir *rs, size_t cap) {
	rs->cap = cap;
	rs->seen = 0;
	rs->vals = calloc(cap, sizeof *rs->vals);
	if (!rs->vals)
		errx(1, "out of memory :(");
}

void reservoir_sample(struct reservoir *rs, int64_t v) {
	size_t r;

	rs->seen++;
	/* Reservoir's not full yet - put v into it and return. */
	if (rs->seen <= rs->cap) {
		rs->vals[rs->seen - 1] = v;
		return;
	}

	/* Reservoir is full - pick a random index in [0, seen). If it's inside
	 * the reservoir, replace that element. */
	r = index_upto(rs->seen);
	if (r <= rs->cap)
		rs->vals[r - 1] = v;
}

size_t min(size_t a, size_t b) {
	return a < b ? a : b;
}

/* All we do here is: allocate the reservoir with the size given on the command
 * line, read integers from stdin and stick them into the reservoir randomly,
 * then print the reservoir afterwards. */
int main(int argc, char *argv[]) {
	char line[64];
	struct reservoir rs;
	size_t i;

	if (argc < 2)
		errx(1, "usage: %s <reservoir size>", argv[0]);

	srandom(time(NULL));
	reservoir_init(&rs, strtoul(argv[1], NULL, 10));

	while (fgets(line, sizeof(line), stdin))
		reservoir_sample(&rs, strtol(line, NULL, 10));

	for (i = 0; i < min(rs.seen, rs.cap); i++)
		printf("%ld\n", rs.vals[i]);

	return 0;
}

/* $#t Reservoir sampling
 * $#s How to choose a random subset of a large set efficiently
 * $#o c, computer-science
 * $#u d23088e0-e9f3-60a2-ab15-12ef7c4ae175
 */