diff --git a/readme.org b/readme.org
index 7b0bfd16e2ff04cbc2f813c324eefb01bf499a0d..61f1ac2790563461c65852b9dbb16457f375391d 100644
--- a/readme.org
+++ b/readme.org
@@ -2,7 +2,7 @@
 
 * Overview
 
-This handles communication between the [[https://github.com/vnegnev/marcos_client][MaRCoS client]] and the MaRCoS [[https://github.com/vnegnev/flocra][FPGA firmware]] . For more information and a getting-started guide for the MaRCoS system as a whole, please see the [[https://github.com/vnegnev/marcos_extras/wiki][MaRCoS wiki]].
+This handles communication between the [[https://github.com/vnegnev/marcos_client][MaRCoS client]] and the MaRCoS [[https://github.com/vnegnev/marga][FPGA firmware]]. For more information and a getting-started guide for the MaRCoS system as a whole, please see the [[https://github.com/vnegnev/marcos_extras/wiki][MaRCoS wiki]].
 
 Utilises the [[https://github.com/ludocode/mpack][MPack]] C library for server-client communications via the msgpack protocol.
 
diff --git a/src/hardware.cpp b/src/hardware.cpp
index c83e7c06312ffb7d6c7948d17600ede6f1a564a8..03fe92ab612c52a51dd8fe09e55d317d9d64dc56 100644
--- a/src/hardware.cpp
+++ b/src/hardware.cpp
@@ -8,8 +8,8 @@
 #include <sys/mman.h>
 
 #ifdef VERILATOR_BUILD
-#include "flocra_model.hpp"
-extern flocra_model *fm;
+#include "marga_model.hpp"
+extern marga_model *mm;
 #endif
 
 // variadic macro for debugging enable/disable
@@ -44,7 +44,7 @@ int hardware::run_request(server_action &sa) {
 		halt_and_reset();
 
 		auto exec = rd32(_exec);
-		bool halted = (exec >> 24) == FLO_STATE_IDLE;
+		bool halted = (exec >> 24) == MAR_STATE_IDLE;
 		mpack_write(wr, halted);
 	}
 
@@ -96,7 +96,7 @@ int hardware::run_request(server_action &sa) {
 	auto regidx = sa.get_command_and_start_reply("regrd", status);
 	if (status == 1) {
 		++commands_understood;
-		mpack_write(wr, rd32(_flo_base + mpack_node_u32(regidx) ));
+		mpack_write(wr, rd32(_mar_base + mpack_node_u32(regidx) ));
 	}
 
 	// Read all registers
@@ -116,21 +116,21 @@ int hardware::run_request(server_action &sa) {
 		mpack_finish_array(wr);
 	}
 
-	// Fill in flocra execution memory
+	// Fill in marga execution memory
 	// TODO: add an input offset too, to avoid having to overwrite everything every time
-	auto fm = sa.get_command_and_start_reply("flo_mem", status);
+	auto mm = sa.get_command_and_start_reply("mar_mem", status);
 	if (status == 1) {
 		++commands_understood;
 		char t[100];
 
 		// uint32_t ro = *(uint32_t *)(GRAD_CTRL_REG_OFFSET);
-		if ( mpack_node_bin_size(fm) <= FLOCRA_MEM_SIZE ) {
-			size_t bytes_copied = hw_mpack_node_copy_data(fm, _flo_mem, FLOCRA_MEM_SIZE);
-			sprintf(t, "flo mem data bytes copied: %zu", bytes_copied);
+		if ( mpack_node_bin_size(mm) <= MARGA_MEM_SIZE ) {
+			size_t bytes_copied = hw_mpack_node_copy_data(mm, _mar_mem, MARGA_MEM_SIZE);
+			sprintf(t, "mar mem data bytes copied: %zu", bytes_copied);
 			sa.add_info(t);
 			mpack_write(wr, c_ok);
 		} else {
-			sprintf(t, "too much flo mem data: %zu bytes > %d -- streaming not yet implemented", mpack_node_bin_size(fm), FLOCRA_MEM_SIZE);
+			sprintf(t, "too much mar mem data: %zu bytes > %d -- streaming not yet implemented", mpack_node_bin_size(mm), MARGA_MEM_SIZE);
 			sa.add_error(t);
 			mpack_write(wr, c_err);
 		}
@@ -200,8 +200,8 @@ int hardware::run_request(server_action &sa) {
 
 		// ensure that nothing is currently running; halt if it is
 		auto exec = rd32(_exec);
-		if ( (exec >> 24) != FLO_STATE_IDLE) {
-			sprintf(t, "flo FSM was not idle when the run began");
+		if ( (exec >> 24) != MAR_STATE_IDLE) {
+			sprintf(t, "mar FSM was not idle when the run began");
 			sa.add_warning(t);
 
 			// halt FSM
@@ -212,20 +212,20 @@ int hardware::run_request(server_action &sa) {
 		const char *rundata = (char *)mpack_node_bin_data(runs);
 		size_t mem_offset = 0;
 
-		// initially fill the flocra memory
-		if (total_bytes_to_copy <= FLOCRA_MEM_SIZE) {
-			hw_memcpy(_flo_mem, rundata, total_bytes_to_copy);
+		// initially fill the marga memory
+		if (total_bytes_to_copy <= MARGA_MEM_SIZE) {
+			hw_memcpy(_mar_mem, rundata, total_bytes_to_copy);
 			mem_offset += total_bytes_to_copy;
 		} else {
 			// first time, wrap around back to the start again,
-			// so don't need to update flo memory ptr
-			hw_memcpy(_flo_mem, rundata, FLOCRA_MEM_SIZE);
-			mem_offset += FLOCRA_MEM_SIZE;
+			// so don't need to update marga memory ptr
+			hw_memcpy(_mar_mem, rundata, MARGA_MEM_SIZE);
+			mem_offset += MARGA_MEM_SIZE;
 		}
 
 		// prepare main FSM control loop
 
-		// max bytes to copy into _flo_mem at a time (would block for this long)
+		// max bytes to copy into _mar_mem at a time (would block for this long)
 		const unsigned max_bytes_to_copy = 128;
 
 		// check execution state for issues periodically
@@ -268,7 +268,7 @@ int hardware::run_request(server_action &sa) {
 				// go back a few instructions when
 				// waiting/pausing
 				debug_printf("PC wrapped\n");
-				pc_offset += FLOCRA_MEM_SIZE;
+				pc_offset += MARGA_MEM_SIZE;
 			}
 			old_pc_hw = pc_hw;
 
@@ -283,7 +283,7 @@ int hardware::run_request(server_action &sa) {
 				//
 				// -16 to keep a buffer zone of 4 un-copied
 				// instructions before PC location
-				bytes_to_copy = pc - mem_offset + FLOCRA_MEM_SIZE - 16;
+				bytes_to_copy = pc - mem_offset + MARGA_MEM_SIZE - 16;
 				if (bytes_to_copy > (int)total_bytes_remaining) bytes_to_copy = total_bytes_remaining;
 			}
 
@@ -295,7 +295,7 @@ int hardware::run_request(server_action &sa) {
 					mem_buffer_underrun = true;
 					debug_printf("mem buf underrun\n");
 					break;
-				} else if (mem_offset - pc < FLOCRA_MEM_SIZE/4) {
+				} else if (mem_offset - pc < MARGA_MEM_SIZE/4) {
 					// memory reserve only 1/4 full
 					++mem_buffer_low;
 					debug_printf("mem buf low\n");
@@ -306,22 +306,22 @@ int hardware::run_request(server_action &sa) {
 					bytes_to_copy = max_bytes_to_copy;
 				}
 
-				size_t local_mem_offset = mem_offset & FLOCRA_MEM_MASK;
+				size_t local_mem_offset = mem_offset & MARGA_MEM_MASK;
 
 				// check whether this copy will wrap
-				if ( local_mem_offset + bytes_to_copy > FLOCRA_MEM_SIZE) { // wrapping: copy in two parts
-					int first_bytes = FLOCRA_MEM_SIZE - local_mem_offset;
+				if ( local_mem_offset + bytes_to_copy > MARGA_MEM_SIZE) { // wrapping: copy in two parts
+					int first_bytes = MARGA_MEM_SIZE - local_mem_offset;
 					int second_bytes = bytes_to_copy - first_bytes;
-					hw_memcpy(_flo_mem + local_mem_offset,
+					hw_memcpy(_mar_mem + local_mem_offset,
 					          rundata + mem_offset, first_bytes);
 					mem_offset += first_bytes;
-					hw_memcpy(_flo_mem, rundata + mem_offset, second_bytes);
+					hw_memcpy(_mar_mem, rundata + mem_offset, second_bytes);
 					mem_offset += second_bytes;
 				} else { // no wrapping
 					debug_printf("hw_memcpy: %zu, %zu, %u\n", local_mem_offset, mem_offset, bytes_to_copy);
-					auto k = (char *)hw_memcpy(_flo_mem + local_mem_offset,
+					auto k = (char *)hw_memcpy(_mar_mem + local_mem_offset,
 					          rundata + mem_offset, bytes_to_copy);
-					debug_printf("bytes copied: %zu\n", k - _flo_mem - local_mem_offset);
+					debug_printf("bytes copied: %zu\n", k - _mar_mem - local_mem_offset);
 					mem_offset += bytes_to_copy;
 				}
 			}
@@ -329,16 +329,16 @@ int hardware::run_request(server_action &sa) {
 			// Read out RX
 			unsigned rx_locs = read_rx(rx0_i, rx0_q, rx1_i, rx1_q, rx_reads_per_loop);
 			// Simple dynamic FIFO read-out speed governor
-			if (rx_locs > FLOCRA_RX_FIFO_SPACE - 2*_max_rx_reads_per_loop) {
+			if (rx_locs > MARGA_RX_FIFO_SPACE - 2*_max_rx_reads_per_loop) {
 				rx_full = true;
 				rx_full_mem_loc = old_pc;
 				// desperately try to clear the FIFOs
 				rx_reads_per_loop = _max_rx_reads_per_loop;
-			} else if (rx_locs > 2 * FLOCRA_RX_FIFO_SPACE / 3) {
+			} else if (rx_locs > 2 * MARGA_RX_FIFO_SPACE / 3) {
 				++rx_nearly_full;
 				// try hard to clear the FIFOs; scale up quickly
 				if (rx_reads_per_loop < _max_rx_reads_per_loop) rx_reads_per_loop = rx_reads_per_loop * 4;
-			} else if (rx_locs > FLOCRA_RX_FIFO_SPACE / 3) {
+			} else if (rx_locs > MARGA_RX_FIFO_SPACE / 3) {
 				if (rx_reads_per_loop < _max_rx_reads_per_loop) rx_reads_per_loop = rx_reads_per_loop * 2; // scale up
 			} else {
 				if (rx_reads_per_loop > _min_rx_reads_per_loop) rx_reads_per_loop = rx_reads_per_loop / 2;
@@ -356,7 +356,7 @@ int hardware::run_request(server_action &sa) {
 				fhdo_err = fhdo_err | (status_latch & 0x4);
 			}
 
-			if (state == FLO_STATE_HALT) {
+			if (state == MAR_STATE_HALT) {
 				finished = true;
 			}
 			old_pc = pc;
@@ -428,7 +428,7 @@ int hardware::run_request(server_action &sa) {
 
 		// TODO make sure all the buffers and RX FIFOs are empty
 		unsigned buf_empty = rd32(_buf_empty);
-		if ( buf_empty != FLO_BUF_ALL_EMPTY ) {
+		if ( buf_empty != MAR_BUF_ALL_EMPTY ) {
 			sprintf(t, "output buffers were not empty at the end of sequence: 0x%08x", buf_empty);
 			sa.add_warning(t);
 		}
@@ -437,7 +437,7 @@ int hardware::run_request(server_action &sa) {
 		unsigned gpa_idle_tries = 0;
 		bool gpas_idle = false;
 		while (gpa_idle_tries < _gpa_idle_tries_limit) {
-			gpas_idle = (rd32(_status) & FLO_STATUS_GPA_MASK) == 0;
+			gpas_idle = (rd32(_status) & MAR_STATUS_GPA_MASK) == 0;
 			if (gpas_idle) break;
 			++gpa_idle_tries;
 		}
@@ -604,27 +604,27 @@ void hardware::init_mem() {
 	// types were used for some of these. Perhaps to allow
 	// different access widths?
 	_slcr = (uint32_t *) mmap(NULL, SLCR_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, SLCR_OFFSET);
-	_flo_base = (uint32_t *) mmap(NULL, FLOCRA_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, FLOCRA_OFFSET);
+	_mar_base = (uint32_t *) mmap(NULL, MARGA_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, MARGA_OFFSET);
 
 	// Map the control and status registers
-	_ctrl = _flo_base + 0;
+	_ctrl = _mar_base + 0;
 	// slv_reg1 for extension in the future
-	_direct = _flo_base + 2;
+	_direct = _mar_base + 2;
 	// slv_reg3 for extension in the future
-	_exec = _flo_base + 4; // execution information
-	_status = _flo_base + 5; // external status, ADC etc
-	_status_latch = _flo_base + 6; // latched external status
-	_buf_err = _flo_base + 7; // latched buffer errors
-	_buf_full = _flo_base + 8; // latched full buffers
-	_buf_empty = _flo_base + 9; // empty buffers
-	_rx_locs = _flo_base + 10; // RX data available
-	_rx0_i_data = _flo_base + 11; // RX0 i data
-	_rx1_i_data = _flo_base + 12; // RX1 i data
-	_rx0_q_data = _flo_base + 13; // RX0 q data
-	_rx1_q_data = _flo_base + 14; // RX1 q data
+	_exec = _mar_base + 4; // execution information
+	_status = _mar_base + 5; // external status, ADC etc
+	_status_latch = _mar_base + 6; // latched external status
+	_buf_err = _mar_base + 7; // latched buffer errors
+	_buf_full = _mar_base + 8; // latched full buffers
+	_buf_empty = _mar_base + 9; // empty buffers
+	_rx_locs = _mar_base + 10; // RX data available
+	_rx0_i_data = _mar_base + 11; // RX0 i data
+	_rx1_i_data = _mar_base + 12; // RX1 i data
+	_rx0_q_data = _mar_base + 13; // RX0 q data
+	_rx1_q_data = _mar_base + 14; // RX1 q data
 
 	// /2 since mem is halfway in address space, /4 to convert to 32-bit instead of byte addressing
-	_flo_mem = reinterpret_cast<volatile char *>(_flo_base + FLOCRA_SIZE/2/4);
+	_mar_mem = reinterpret_cast<volatile char *>(_mar_base + MARGA_SIZE/2/4);
 }
 
 void hardware::halt() {
@@ -639,7 +639,7 @@ void hardware::halt() {
 	// Wait a while for all the output buffers to empty
 	unsigned k = 0;
 	while (k < _halt_tries_limit) {
-		if ( rd32(_buf_empty) == FLO_BUF_ALL_EMPTY ) break;
+		if ( rd32(_buf_empty) == MAR_BUF_ALL_EMPTY ) break;
 		++k;
 	}
 
@@ -649,7 +649,7 @@ void hardware::halt() {
 		read_rx(rx0_i, rx0_q, rx1_i, rx1_q);
 	}
 
-	while ( (rd32(_exec) >> 24 == FLO_STATE_COUNTDOWN) && k < _halt_tries_limit ) {
+	while ( (rd32(_exec) >> 24 == MAR_STATE_COUNTDOWN) && k < _halt_tries_limit ) {
 		++k;
 	}
 	wr32(_ctrl, 0x0); // set FSM to idle (not explicitly halted)
@@ -724,13 +724,13 @@ unsigned hardware::read_rx(std::vector<uint32_t> &rx0_i, std::vector<uint32_t> &
 
 void hardware::wr32(volatile uint32_t *addr, uint32_t data) {
 #ifdef VERILATOR_BUILD
-	if (addr >= _flo_base && addr < _flo_base + FLOCRA_SIZE) {
+	if (addr >= _mar_base && addr < _mar_base + MARGA_SIZE) {
 		// do byte-address arithmetic
 		auto offs_addr = reinterpret_cast<volatile char *>(addr)
-			- reinterpret_cast<volatile char *>(_flo_base);
-		// printf("addresses 0x%0lx, 0x%0lx\n", addr, _flo_base);
-		// printf("write flo 0x%0lx, 0x%08x\n", offs_addr, data);
-		fm->wr32(offs_addr, data); // convert to byte addressing
+			- reinterpret_cast<volatile char *>(_mar_base);
+		// printf("addresses 0x%0lx, 0x%0lx\n", addr, _mar_base);
+		// printf("write mar 0x%0lx, 0x%08x\n", offs_addr, data);
+		mm->wr32(offs_addr, data); // convert to byte addressing
 	} else {
 		printf("write addr 0x%0lx, 0x%08x NOT SIMULATED\n", (size_t) addr, data);
 	}
@@ -741,12 +741,12 @@ void hardware::wr32(volatile uint32_t *addr, uint32_t data) {
 
 uint32_t hardware::rd32(volatile uint32_t *addr) {
 #ifdef VERILATOR_BUILD
-	if (addr >= _flo_base && addr < _flo_base + FLOCRA_SIZE) {
+	if (addr >= _mar_base && addr < _mar_base + MARGA_SIZE) {
 		// do byte-address arithmetic
 		auto offs_addr = reinterpret_cast<volatile char *>(addr)
-			- reinterpret_cast<volatile char *>(_flo_base);
-		// printf("read flo 0x%0lx\n", offs_addr);
-		return fm->rd32(offs_addr); // convert to byte addressing
+			- reinterpret_cast<volatile char *>(_mar_base);
+		// printf("read mar 0x%0lx\n", offs_addr);
+		return mm->rd32(offs_addr); // convert to byte addressing
 	} else {
 		printf("read addr 0x%0lx NOT SIMULATED\n", (size_t) addr);
 		return 0;
diff --git a/src/hardware.hpp b/src/hardware.hpp
index e1822401d3832f313c4a04a02fa212a94e96752f..6533ed9d9afb1665fa75ca6b491e2f2d97ff43cb 100644
--- a/src/hardware.hpp
+++ b/src/hardware.hpp
@@ -12,20 +12,20 @@
 // Memory-mapped device sizes
 static const unsigned PAGESIZE = sysconf(_SC_PAGESIZE); // should be 4096 (4KiB) on both x86_64 and ARM
 static const unsigned SLCR_SIZE = PAGESIZE,
-	FLOCRA_SIZE = 128*PAGESIZE,
-	FLOCRA_MEM_SIZE = 64*PAGESIZE;
-static const unsigned FLOCRA_MEM_MASK = 0x3ffff;
-static const unsigned FLOCRA_RX_FIFO_SPACE = 16384;
+	MARGA_SIZE = 128*PAGESIZE,
+	MARGA_MEM_SIZE = 64*PAGESIZE;
+static const unsigned MARGA_MEM_MASK = 0x3ffff;
+static const unsigned MARGA_RX_FIFO_SPACE = 16384;
 
-// flocra internal states
-static const unsigned FLO_STATE_IDLE = 0, FLO_STATE_PREPARE = 1, FLO_STATE_RUN = 2,
-	FLO_STATE_COUNTDOWN = 3, FLO_STATE_TRIG = 4, FLO_STATE_TRIG_FOREVER = 5,
-	FLO_STATE_HALT = 8;
+// marga internal states
+static const unsigned MAR_STATE_IDLE = 0, MAR_STATE_PREPARE = 1, MAR_STATE_RUN = 2,
+	MAR_STATE_COUNTDOWN = 3, MAR_STATE_TRIG = 4, MAR_STATE_TRIG_FOREVER = 5,
+	MAR_STATE_HALT = 8;
 
-static const unsigned FLO_BUFS = 24;
-static const unsigned FLO_BUF_ALL_EMPTY = (1 << FLO_BUFS) - 1;
+static const unsigned MAR_BUFS = 24;
+static const unsigned MAR_BUF_ALL_EMPTY = (1 << MAR_BUFS) - 1;
 
-static const unsigned FLO_STATUS_GPA_MASK = 0x00030000;
+static const unsigned MAR_STATUS_GPA_MASK = 0x00030000;
 
 struct mpack_node_t;
 
@@ -62,11 +62,11 @@ private:
 	unsigned _max_rx_reads_per_loop = 1024;
 
 	// Peripheral register addresses in PL
-	volatile uint32_t *_slcr, *_flo_base, *_ctrl, *_direct, *_exec, *_status,
+	volatile uint32_t *_slcr, *_mar_base, *_ctrl, *_direct, *_exec, *_status,
 		*_status_latch, *_buf_err, *_buf_full, *_buf_empty, *_rx_locs,
 		*_rx0_i_data, *_rx1_i_data, *_rx0_q_data, *_rx1_q_data;
 
-	volatile char *_flo_mem;
+	volatile char *_mar_mem;
 
 	/// @brief Write
 
diff --git a/src/marcos_server.cpp b/src/marcos_server.cpp
index 098b4c200d13b87323e940a33b9835dcaf28e813..289f30440d9503183e8cd42f8f78da97967eb5e5 100644
--- a/src/marcos_server.cpp
+++ b/src/marcos_server.cpp
@@ -1,10 +1,10 @@
 /* Top-level MaRCoS server file.
  *
  * When compiling Verilator simulation, replace this with
- * flocra_sim_main.cpp in the flocra library.
+ * marga_sim_main.cpp in the marga library.
  *
  * Overall operation should remain compatible between both, apart from
- * the extra Verilator-related objects in flocra_sim_main which are
+ * the extra Verilator-related objects in marga_sim_main which are
  * made use of in hardware.cpp for emulating PS<->PL communication.
  *
  */
diff --git a/src/server_config.h b/src/server_config.h
index 0f5df48976cdd2fb186df332fd4c43e470dccc9b..cb45a8421a3f0e1674096a405ab1194851d29741 100644
--- a/src/server_config.h
+++ b/src/server_config.h
@@ -6,15 +6,15 @@
 // Memory map
 #ifdef __arm__
 static const uint32_t SLCR_OFFSET = 0xf8000000,
-	FLOCRA_OFFSET = 0x43c00000, // ocra_grad_ctrl core's main offset (TODO: fill this in with Vivado value once it's ready)
-	FLOCRA_MEM_OFFSET = FLOCRA_OFFSET + 262144; // 256KiB offset inside the core to access the BRAMs
+	MARGA_OFFSET = 0x43c00000, // ocra_grad_ctrl core's main offset (TODO: fill this in with Vivado value once it's ready)
+	MARGA_MEM_OFFSET = MARGA_OFFSET + 262144; // 256KiB offset inside the core to access the BRAMs
 
 #else // emulate the memory in a single file on the desktop, for the purposes of debugging, emulation etc
 static const unsigned EMU_PAGESIZE = 0x1000; // 4 KiB, page size on the RP and my desktop machine
 static const uint32_t SLCR_OFFSET = 0, // 1 page in size
-	FLOCRA_OFFSET = SLCR_OFFSET + EMU_PAGESIZE,
-	FLOCRA_MEM_OFFSET = FLOCRA_OFFSET + 64 * EMU_PAGESIZE,
-	END_OFFSET = FLOCRA_MEM_OFFSET + 64 * EMU_PAGESIZE;
+	MARGA_OFFSET = SLCR_OFFSET + EMU_PAGESIZE,
+	MARGA_MEM_OFFSET = MARGA_OFFSET + 64 * EMU_PAGESIZE,
+	END_OFFSET = MARGA_MEM_OFFSET + 64 * EMU_PAGESIZE;
 #endif
 
 // Auxiliary parameters