/* Virtual Machines, Part 0 * * This blog post is also a valid C89 program which you can compile and run. * Compile it with: cc -x c -o vm post.txt * Then run it with: ./vm * * A virtual machine is a simulation of a computer. There are a lot of kinds of * virtual machines, but for the purposes of this post, we'll be looking at a * simulation of a very idealized version of a physical computer. A physical * computer has: * - a CPU, which executes instructions * - some memory, from which values are read and written * - and some way of talking to the outside world (an input-output system). * * Note that this is a vast simplification! * * Anyway, in our virtual machine, the memory will be stored in "host" memory, * and the CPU will be simulated using code running on the host. In this * instance, the host is just the machine on which the virtual machine simulator * is being run. The input-output system will input from and output to the * host's terminal. * * The CPU does the actual computation, and it executes a sequence of * instructions, which are things like "add these two values" or "if these two * values aren't equal, go to this instruction" or "output this value". These * instructions are usually stored in memory, just like the values they operate * on, although they can be stored somewhere else. For the sake of this example, * we'll store the instructions somewhere separate from the data. * * Alright, on to the implementation! */ /* We'll need this to talk to the host's terminal: */ #include /* Real machines have a "word size", which is the native size of values the * CPU operates on. For our VM here, we'll use a 16-bit word, and all of our * operations will be in words. * 'typedef' is how we tell the compiler a new name for an existing type. * In this case, we are renaming: * - 'unsigned' (no negative values) * - 'short' (16 bits) * to 'Word', for simplicity's sake. */ typedef unsigned short Word; /* The maximum amount of memory our VM has; for simplicity's sake, this is the * same as the number of possible words. */ #define MAXMEM 65536 /* Our VM has some memory to store data, a list of instructions to apply to * that data, and a "program counter", which keeps track of where the CPU is in the * list of instructions. In this VM, the instructions are stored somewhere separate, * and in fact we'll hard-code them in these examples. The instructions are * represented as words, too - this will be convenient later on, when we want * to have instructions that generate or modify new instructions. * Here, we are defining a new data type, 'Vm', that consists of those three values. */ typedef struct { Word memory[MAXMEM]; Word *instructions; Word pc; /* Whether the VM has stopped running or not - we'll need this to * allow VM programs to terminate themselves. */ int halted; } Vm; /* In this VM, each instruction will be one or more words long, and the first * word of any instruction will always be the "opcode", which identifies what * instruction it is. This is pretty wasteful of space - most of the bits of * the opcode aren't going to be used - and a real machine would pack the * instruction together more compactly. * * Anyway, we'll start with just four opcodes: ADD, CONSTANT, HALT, and OUT. */ typedef enum { OPC_ADD = 0x00, OPC_CONSTANT = 0x01, OPC_HALT = 0x02, OPC_OUT = 0x03, } Opcode; /* So, we have some opcodes, but we need a function that actually *does* * something with those opcodes. That function is basically simulating the VM's * CPU. We'll call it step(), and say that it executes the next instruction * in the VM's instruction list and updates the VM's state appropriately. * * To do that, we just need to fetch the next instruction, then figure out what * to do based on what it is. */ /* Some helper functions we're about to need, to keep the body of step() * simple... */ Word fetch_instruction(Vm *vm); void op_add(Vm *vm); void op_constant(Vm *vm); void op_halt(Vm *vm); void op_out(Vm *vm); void step(Vm *vm) { Word opcode = fetch_instruction(vm); /* This could be done with a table of function pointers instead, but * the switch is a bit easier to follow. */ switch (opcode) { case OPC_ADD: op_add(vm); break; case OPC_CONSTANT: op_constant(vm); break; case OPC_HALT: op_halt(vm); break; case OPC_OUT: op_out(vm); break; } } /* To fetch an instruction from the instruction sequence, all we do is use the * program counter as an index, then increment the program counter afterward - * the pc always points at the next instruction we're about to execute. */ Word fetch_instruction(Vm *vm) { Word in = vm->instructions[vm->pc]; vm->pc++; return in; } /* Our "add" operation will take two arguments, both addresses in the VM's * memory, and add the contents of the second to the contents of the first. * For example, ADD 12 34 would add the contents of memory word 34 to memory * word 12. */ void op_add(Vm *vm) { Word dest_addr = fetch_instruction(vm); Word source_addr = fetch_instruction(vm); /* remember, the program counter will have incremented */ vm->memory[dest_addr] += vm->memory[source_addr]; } /* Our "constant" operation also takes two arguments. The first is a memory * address, and the second is an actual value. It stores the given value at the * given address. */ void op_constant(Vm *vm) { Word dest_addr = fetch_instruction(vm); Word value = fetch_instruction(vm); vm->memory[dest_addr] = value; } /* Our "halt" operation takes no arguments at all, and stops execution of the * VM by setting the halted flag on it. */ void op_halt(Vm *vm) { vm->halted = 1; } /* Our "out" operation takes one argument: a memory address. It emits the * value at that memory address on the host's terminal, as a character. */ void op_out(Vm *vm) { Word source_addr = fetch_instruction(vm); putchar(vm->memory[source_addr]); } /* Alright! We have our step function, which simulates executing one * instruction, implemented. There's just a couple more things we need. The * first is a run function, which will just execute instructions repeatedly * until the VM program halts: */ void run(Vm *vm) { while (!vm->halted) step(vm); } /* And the second is a "boot" function, which initializes the VM - like a * real machine, a freshly allocated VM is full of uninitialized memory. Let's * zero that out and start the program counter at the first instruction. */ void zero(Word *memory, int nwords); void boot(Vm *vm, Word *program) { zero(vm->memory, MAXMEM); vm->instructions = program; vm->pc = 0; vm->halted = 0; } void zero(Word *memory, int nwords) { for (int i = 0; i < nwords; i++) memory[i] = 0; } /* Now we just need our main() function to make this a complete C program! * The main() function is going to boot up the VM with a hardcoded program and * run it. */ Word sample_program[]; Vm hello_vm; int main() { boot(&hello_vm, sample_program); run(&hello_vm); return 0; } /* This program emits the string "aceg" with a newline, then halts. */ Word sample_program[] = { /* Fill in the first two memory addresses with the character 'a' and * the value 2, which we're going to use as an increment. */ OPC_CONSTANT, 0, 'a', OPC_CONSTANT, 1, 2, /* Emit 'a' from location 0, then add 2 (from location 1) to it */ OPC_OUT, 0, OPC_ADD, 0, 1, /* Emit 'c' and add 2 */ OPC_OUT, 0, OPC_ADD, 0, 1, /* Emit 'e' and add 2 */ OPC_OUT, 0, OPC_ADD, 0, 1, /* Emit 'g' */ OPC_OUT, 0, /* Overwrite location 0 with a newline and emit it */ OPC_CONSTANT, 0, '\n', OPC_OUT, 0, /* Stop the program! */ OPC_HALT, }; /* And there's our virtual machine! The instructions we have here aren't * particularly capable, but we can add a lot more: ones that take input, ones * that move the program counter around to implement conditionals and looping, * or even ones that provide access to other parts of the host's power. * * Of course, you might be asking: why? why not just run a program directly on * the host? Three reasons: * 1. The program might have been written for a different machine (like * emulating an old game console) * 2. You might wish to write programs that run on different types of * machines, by always targeting a virtual machine that every platform has * (like Java) * 3. You might want to constrain the power the program has over the host - * for example, in our code above we could easily have imposed rules like * "the program can only run 100 instructions" or "the program has only * 1kB of memory" to limit the damage that buggy or malicious programs can * do. * * Stay tuned for part 1, which will talk about instruction encoding and * register machines! */