Bitstream decompression & initial support for MachXO2

MachXO2 support was helped by earlier work by cr1901 in figuring
out some bitstream differences (row/column swapped, 1-based indexing
for columns, frame order reversed, etc.) and writing the first fuzzers.

Signed-off-by: Andres Navarro <canavarro82@gmail.com>
diff --git a/devices.json b/devices.json
index a2095be..ac5f989 100644
--- a/devices.json
+++ b/devices.json
@@ -111,6 +111,47 @@
                 "fuzz": 1
             }
         }
+    },
+    "MachXO2" : {
+      "devices" : {
+            "LCMXO2-256HC": {
+                "packages": ["QFN32"],
+                "idcode": "0x012b8043",
+                "frames": 186,
+                "bits_per_frame": 504,
+                "pad_bits_after_frame": 0,
+                "pad_bits_before_frame": 0,
+                "max_row" : 7,
+                "max_col" : 10,
+                "col_bias" : 1,
+                "fuzz": 1
+            },
+
+            "LCMXO2-1200HC": {
+                "packages": ["QFN32"],
+                "idcode": "0x012ba043",
+                "frames": 333,
+                "bits_per_frame": 1080,
+                "pad_bits_after_frame": 0,
+                "pad_bits_before_frame": 0,
+                "max_row" : 12,
+                "max_col" : 22,
+                "col_bias" : 1,
+                "fuzz": 1
+            },
+            "LCMXO2-4000HC": {
+                "packages": ["TQFP144"],
+                "idcode": "0x012bc043",
+                "frames": 623,
+                "bits_per_frame": 1560,
+                "pad_bits_after_frame": 0,
+                "pad_bits_before_frame": 0,
+                "max_row" : 22,
+                "max_col" : 32,
+                "col_bias" : 1,
+                "fuzz": 1
+            }
+        }
     }
   }
 }
diff --git a/diamond.sh b/diamond.sh
index b23aff5..24d7339 100755
--- a/diamond.sh
+++ b/diamond.sh
@@ -92,11 +92,28 @@
 		LSE_ARCH="ECP5UM5G"
 		;;
 
+	LCMXO2-256HC)
+		PACKAGE="${DEV_PACKAGE:-QFN32}"
+		DEVICE="LCMXO2-256HC"
+		LSE_ARCH="MachXO2"
+		;;
+
+	LCMXO2-1200HC)
+		PACKAGE="${DEV_PACKAGE:-QFN32}"
+		DEVICE="LCMXO2-1200HC"
+		LSE_ARCH="MachXO2"
+		;;
+
 	LCMXO2-2000HC)
 		PACKAGE="${DEV_PACKAGE:-TQFP100}"
 		DEVICE="LCMXO2-2000HC"
 		LSE_ARCH="MachXO2"
 		;;
+	LCMXO2-4000HC)
+		PACKAGE="${DEV_PACKAGE:-TQFP144}"
+		DEVICE="LCMXO2-4000HC"
+		LSE_ARCH="MachXO2"
+		;;
 	LCMXO2-7000HC)
 		PACKAGE="${DEV_PACKAGE:-TQFP144}"
 		DEVICE="LCMXO2-7000HC"
diff --git a/docs/architecture/bitstream_format.rst b/docs/architecture/bitstream_format.rst
index 273dfd5..3b7234d 100644
--- a/docs/architecture/bitstream_format.rst
+++ b/docs/architecture/bitstream_format.rst
@@ -22,7 +22,7 @@
 
 The CRC16 is accumulated over all commands until a CRC16 check is reached. It is not reset at the end of commands
 without a CRC16 check - except the ``LSC_RESET_CRC`` command, and after the actual bitstream payload
-(``LSC_PROG_INCR_RTI``).
+(``LSC_PROG_INCR_RTI`` or ``LSC_PROG_INCR_CMP``).
 
 The CRC16 is calculated using the polynomial 0x8005 with no bit reversal. This algorithm is sometimes known as
 "CRC16-BUYPASS".
@@ -45,6 +45,11 @@
 | ``VERIFY_ID``                 | E2  | - 24 bit info: all 0     | This checks the actual device ID against the given|
 |                               |     | - 32 bit device JTAG ID  | value and fails if they do not match.             |
 +-------------------------------+-----+--------------------------+---------------------------------------------------+
+| ``LSC_WRITE_COMP_DIC``        | 02  | - 24 bit info: all 0     | This stores the 8 most common bytes in the frames |
+|                               |     | - 8 bit Pattern7         |                                                   |
+|                               |     | - ... (6 more patterns)  |                                                   |
+|                               |     | - 8 bit Pattern0         |                                                   |
++-------------------------------+-----+--------------------------+---------------------------------------------------+
 | ``LSC_PROG_CNTRL0``           | 22  | - 24 bit info: all 0     | This sets the value of device control register 0  |
 |                               |     | - 32 bit CtlReg0 value   | Normally 0x40000000                               |
 +-------------------------------+-----+--------------------------+---------------------------------------------------+
@@ -60,7 +65,8 @@
 
 Configuration Data
 -------------------
-The FPGA configuration data itself is programmed by using command ``LSC_PROG_INCR_RTI`` (0x82). Following this command,
+The FPGA configuration data itself is programmed by using command ``LSC_PROG_INCR_RTI`` (0x82) if no compression is
+used and command ``LSC_PROG_INCR_CMP`` (0xB8) when using compression. Following either of these commands,
 there are some setup bits:
 
  - 1 bit: CRC16 comparison flag, normally set
@@ -72,21 +78,59 @@
 
 This is then followed by a number of frames, each in the following format:
 
- - The configuration frame itself, such that bit 0 of the first byte sent is the MSB of the frame,
-   bit 7 of the first byte the MSB-7 and bit 0 of the last byte (if there are no dummy bits) being the LSB of the frame.
- - Any dummy bits needed to pad the frame to a whole number of bytes
- - A CRC-16 checksum:
-
-    - For the first frame, this also covers any other commands sent
-      before the programming command but after a CRC reset, and the programming command itself.
+ - The configuration frame itself (compressed in the case of the ``LSC_PROG_INCR_CMP`` command),
+   such that bit 0 of the first byte is the MSB of the frame, bit 7 of the first byte the
+   MSB-7 and bit 0 of the last byte (if there are no dummy bits) being the LSB of the frame.
+ - Any dummy bits needed to pad the frame to a whole number of bytes.
+ - If the second flag is cleared (see above) a CRC-16 checksum:
+    - For the first frame, this also covers any other commands sent before the programming command
+      but after a CRC reset, and the programming command itself.
     - For subsequent frames, this excludes dummy bytes between frames
  - Dummy 0xFF bytes, usually only 1
 
 The highest numbered frame in the chip is sent first.
 
-Separate commands are used if EBR needs to be configured in the bitstream. ``EBR_ADDRESS`` (0xF6) is used to select the
-EBR to program and the starting address in the EBR; and ``LSC_EBR_WRITE`` (0xB2) is used to program the EBR itself using
-72-bit frames. The specifics of these still need to be documented.
+If the second flag is set there's no CRC sent in between frames but there's still one CRC-16 checksum
+after all the frames (this also covers any other commands sent before the programming command but after a CRC reset,
+and the programming command itself).
+
+Separate commands are used if EBR needs to be configured in the bitstream. EBR data can't use compression.
+``EBR_ADDRESS`` (0xF6) is used to select the EBR to program and the starting address in the EBR;
+and ``LSC_EBR_WRITE`` (0xB2) is used to program the EBR itself using 72-bit frames. The specifics of these
+still need to be documented.
+
+Compression Algorithm
+------------------------------
+
+ - Before compression, the frame is left padded with zeroes (0) to make the data frame 64-bit bounded.
+ - After compressing the frame data, the resulting bits are right padded with zeroes (0) to make the data
+   frame byte bounded.
+
+After padding, every byte in the bitstream is compressed by a simple prefix-free code with just 4 cases:
+
++--------------+--------------+--------+---------------------+
+| Code         | Argument     | Length | Encoded byte        |
++==============+==============+========+=====================+
+| 0            |              | 1      | zero                |
++--------------+--------------+--------+---------------------+
+| 100xxx       | bit position | 6      | byte with 1 bit set |
++--------------+--------------+--------+---------------------+
+| 101xxx       | byte index   | 6      | stored byte         |
++--------------+--------------+--------+---------------------+
+| 11xxxxxxxx   | lit. byte    | 10     | all others          |
++--------------+--------------+--------+---------------------+
+
+- The first case is for the byte zero (``00000000``).  That's just represented by a single zero bit (``0``).
+- The second case is for bytes with just one bit set.  After a ``100`` the set bit position is encoded
+  in the following 3 bits.  For example the byte ``00100000`` is encoded as ``100101`` because only the
+  bit 5 is set (with 0 being the lsb and 7 the msb).
+- The third case is for selecting one of the bytes stored by the ``LSC_WRITE_COMP_DIC`` instruction.  Those
+  bytes are selected as the 8 most common bytes (ignoring the zero bytes and the bytes with just one bit set,
+  because those are encoded with the two previous cases).  After a ``101`` the number of the selected pattern
+  is encoded with 3 bits.  For example to select pattern3 the code would be ``101011``.
+- The fourth case is for all remaining bytes.  In that case after a ``11`` the complete byte is copied.  For example
+  byte ``11001010`` would be encoded as ``1111001010``.
+
 
 Device-Specific Information
 ------------------------------
@@ -105,4 +149,4 @@
 | LFE5U-85  | 0x41113043  | 13294  | 1136                  | 0                    |
 +-----------+-------------+--------+-----------------------+----------------------+
 | LFE5UM-85 | 0x01113043  | 13294  | 1136                  | 0                    |
-+-----------+-------------+--------+-----------------------+----------------------+
\ No newline at end of file
++-----------+-------------+--------+-----------------------+----------------------+
diff --git a/fuzzers/MachXO2/003-plc_lut_init/empty.ncl b/fuzzers/MachXO2/003-plc_lut_init/empty.ncl
new file mode 100644
index 0000000..cbdac0a
--- /dev/null
+++ b/fuzzers/MachXO2/003-plc_lut_init/empty.ncl
@@ -0,0 +1,12 @@
+::FROM-WRITER;
+design top
+{
+   device
+   {
+       architecture xo2c00;
+       device LCMXO2-4000HC;
+       package TQFP144;
+       performance "6";
+   }
+
+}
diff --git a/fuzzers/MachXO2/003-plc_lut_init/fuzzer.py b/fuzzers/MachXO2/003-plc_lut_init/fuzzer.py
new file mode 100644
index 0000000..2e779ba
--- /dev/null
+++ b/fuzzers/MachXO2/003-plc_lut_init/fuzzer.py
@@ -0,0 +1,46 @@
+from fuzzconfig import FuzzConfig
+import nonrouting
+import fuzzloops
+import nets
+import pytrellis
+import re
+
+cfg = FuzzConfig(job="PLCINIT", family="MachXO2", device="LCMXO2-4000HC", ncl="empty.ncl", tiles=["R9C9:PLC"])
+
+
+def get_lut_function(init_bits):
+    sop_terms = []
+    lut_inputs = ["A", "B", "C", "D"]
+    for i in range(16):
+        if init_bits[i]:
+            p_terms = []
+            for j in range(4):
+                if i & (1 << j) != 0:
+                    p_terms.append(lut_inputs[j])
+                else:
+                    p_terms.append("~" + lut_inputs[j])
+            sop_terms.append("({})".format("*".join(p_terms)))
+    if len(sop_terms) == 0:
+        lut_func = "0"
+    else:
+        lut_func = "+".join(sop_terms)
+    return lut_func
+
+
+def main():
+    pytrellis.load_database("../../../database")
+    cfg.setup()
+    empty_bitfile = cfg.build_design(cfg.ncl, {})
+    cfg.ncl = "lut.ncl"
+
+    def per_slice(slicen):
+        for k in range(2):
+            def get_substs(bits):
+                return dict(slice=slicen, k=str(k), lut_func=get_lut_function(bits))
+            nonrouting.fuzz_word_setting(cfg, "SLICE{}.K{}.INIT".format(slicen, k), 16, get_substs, empty_bitfile)
+
+    fuzzloops.parallel_foreach(["A", "B", "C", "D"], per_slice)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzzers/MachXO2/003-plc_lut_init/lut.ncl b/fuzzers/MachXO2/003-plc_lut_init/lut.ncl
new file mode 100644
index 0000000..2df3dc3
--- /dev/null
+++ b/fuzzers/MachXO2/003-plc_lut_init/lut.ncl
@@ -0,0 +1,25 @@
+::FROM-WRITER;
+design top
+{
+   device
+   {
+       architecture xo2c00;
+       device LCMXO2-4000HC;
+       package TQFP144;
+       performance "6";
+   }
+
+   comp SLICE_0
+   {
+      logical
+      {
+         cellmodel-name SLICE;
+         program "MODE:LOGIC "
+                 "K${k}::H${k}=${lut_func} "
+                 "F${k}:F ";
+         primitive K${k} i3_4_lut;
+      }
+      site R9C9${slice};
+   }
+
+}
diff --git a/libtrellis/include/Bitstream.hpp b/libtrellis/include/Bitstream.hpp
index ab20abd..e644f80 100644
--- a/libtrellis/include/Bitstream.hpp
+++ b/libtrellis/include/Bitstream.hpp
@@ -49,6 +49,7 @@
     static Bitstream serialise_chip(const Chip &chip, const map<string, string> options);
 
     // Deserialise a bitstream to a Chip
+    Chip deserialise_chip();
     Chip deserialise_chip(boost::optional<uint32_t> idcode = boost::optional<uint32_t>());
 
     // Write a Lattice .bit file (metadata + bitstream)
diff --git a/libtrellis/src/Bitstream.cpp b/libtrellis/src/Bitstream.cpp
index 0bc5bcf..24fadda 100644
--- a/libtrellis/src/Bitstream.cpp
+++ b/libtrellis/src/Bitstream.cpp
@@ -9,6 +9,7 @@
 #include <boost/optional.hpp>
 #include <iomanip>
 #include <fstream>
+#include <array>
 
 namespace Trellis {
 
@@ -67,6 +68,15 @@
         return val;
     }
 
+    // Like get_byte, but skip the CRC update when the byte read is a dummy (0xFF)
+    inline uint8_t get_byte_maybe_dummy() {
+        assert(iter < data.end());
+        uint8_t val = *(iter++);
+        if (val != 0xff)
+            update_crc16(val);
+        return val;
+    }
+
     // Write a single byte and update CRC
     inline void write_byte(uint8_t b) {
         data.push_back(b);
@@ -82,6 +92,83 @@
         }
     }
 
+    // Decompress `count` bytes into an OutputIterator; compressed input is read with get_byte()
+    template<typename T>
+    void get_compressed_bytes(T out, size_t count, array<uint8_t, 8> compression_dict) {
+        // Here we store data already read by get_byte(); it may be more than 1 byte at times!!
+        uint16_t read_data = 0;
+        size_t remaining_bits = 0;
+        bool next_bit;
+
+        uint8_t udata;
+
+        //
+        // Every byte can be encoded by one of 4 cases
+        // It's a prefix-free code so we can identify each one just by looking at the first bits:
+        // 0 -> Byte zero (0000 0000)
+        // 100 xxx -> Byte with a single bit set, xxx is the index of the set bit (0 is lsb, 7 is msb)
+        // 101 xxx -> Stored byte in compression_dict, xxx is the index (0-7)
+        // 11 xxxxxxxx -> Literal byte, xxxxxxxx is the encoded byte
+        //
+        for (size_t i = 0; i < count; i++) {
+            // Make sure we have at least one bit in the buffer
+            if (!remaining_bits) {
+                read_data = (uint32_t) get_byte();
+                remaining_bits = 8;
+            }
+            next_bit = bool(read_data >> (remaining_bits-1) & 1);
+            remaining_bits--;
+
+            // Check the 4 cases leaving the uncompressed byte in udata
+            if (next_bit) {
+                // Starts with 1, so check next bit/bits
+                // For each of the 3 remaining cases we will need at least 5 more bits,
+                // so if we have less than that it's ok to read another byte
+                if (remaining_bits < 5) {
+                    read_data = (read_data << 8) | ((uint32_t) get_byte());
+                    remaining_bits += 8;
+                }
+                next_bit = bool(read_data >> (remaining_bits-1) & 1);
+                remaining_bits--;
+
+                if (next_bit) {
+                    // 11 xxxx xxxx: Literal byte, just read the next 8 bits & use that
+                    // we consumed 10 bits total
+                    if (remaining_bits < 8) {
+                        read_data = (read_data << 8) | ((uint32_t) get_byte());
+                        remaining_bits += 8;
+                    }
+                    udata = uint8_t((read_data >> (remaining_bits - 8)) & 0xff);
+                    remaining_bits -= 8;
+                } else {
+                    // Starts with 10, it could be a stored byte or a single-bit-set byte
+                    // 10 ? xxx: In both cases we need the index xxx, so extract it now
+                    // We already have all the bits we need buffered
+                    next_bit = bool(read_data >> (remaining_bits-1) & 1);
+                    remaining_bits--;
+                    size_t idx = (size_t) ((read_data >> (remaining_bits-3)) & 0x7);
+                    remaining_bits -= 3;
+                    if (next_bit) {
+                        // 101 xxx: Stored byte.  Just use xxx as index in the dictionary,
+                        // we consumed 6 bits
+                        udata = compression_dict[idx];
+                    } else {
+                        // 100 xxx: Single-bit-set byte, xxx is the index of the set bit
+                        // we consumed 6 bits
+                        udata = uint8_t(1 << idx);
+                    }
+                }
+            } else {
+                // 0: the uncompressed byte is zero
+                // we consumed just one bit
+                udata = 0;
+            }
+            *out = udata;
+            ++out;
+        }
+        // if remaining bits > 0 they are just padding bits added to the end so we can ignore them
+    }
+
     // Write multiple bytes from an InputIterator and update CRC
     template<typename T>
     void write_bytes(T in, size_t count) {
@@ -99,18 +186,6 @@
         for (size_t i = 0; i < count; i++) write_byte(0x00);
     }
 
-    // Skip over a possible-dummy command section of N bytes, updating CRC only if command is not 0xFF
-    uint8_t skip_possible_dummy(int size) {
-        uint8_t cmd = *(iter++);
-        if (cmd == 0xFF) {
-            iter += (size - 1);
-        } else {
-            update_crc16(cmd);
-            skip_bytes(size - 1);
-        }
-        return cmd;
-    }
-
     // Insert dummy bytes into the bitstream, without updating CRC
     void insert_dummy(size_t count) {
         for (size_t i = 0; i < count; i++)
@@ -234,11 +309,17 @@
 
 static const vector<uint8_t> preamble = {0xFF, 0xFF, 0xBD, 0xB3};
 
+Chip Bitstream::deserialise_chip() {
+    return deserialise_chip(boost::none);
+}
+
 Chip Bitstream::deserialise_chip(boost::optional<uint32_t> idcode) {
     cerr << "bitstream size: " << data.size() * 8 << " bits" << endl;
     BitstreamReadWriter rd(data);
     boost::optional<Chip> chip;
     bool found_preamble = rd.find_preamble(preamble);
+    boost::optional<array<uint8_t, 8>> compression_dict;
+
     if (!found_preamble)
         throw BitstreamParseError("preamble not found in bitstream");
 
@@ -246,7 +327,7 @@
     int addr_in_ebr = 0;
 
     while (!rd.is_end()) {
-        uint8_t cmd = rd.get_byte();
+        uint8_t cmd = rd.get_byte_maybe_dummy();
         switch ((BitstreamCommand) cmd) {
             case BitstreamCommand::LSC_RESET_CRC:
                 BITSTREAM_DEBUG("reset crc");
@@ -290,40 +371,88 @@
                     rd.check_crc16();
             }
                 break;
+            case BitstreamCommand::LSC_WRITE_COMP_DIC: {
+                bool check_crc = (rd.get_byte() & 0x80) != 0;
+                rd.skip_bytes(2);
+                compression_dict = boost::make_optional(array<uint8_t, 8>());
+                // patterns are stored in the bitstream in reverse order: pattern7 to pattern0
+                for (int i = 7; i >= 0; i--) {
+                  uint8_t pattern = rd.get_byte();
+                  compression_dict.get()[i] = pattern;
+                }
+                BITSTREAM_DEBUG("write compression dictionary: " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[0]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[1]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[2]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[3]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[4]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[5]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[6]) << " " <<
+                                "0x" << hex << setw(2) << setfill('0') << int(compression_dict.get()[7]));;
+                if (check_crc)
+                  rd.check_crc16();
+            }
+                break;
             case BitstreamCommand::LSC_INIT_ADDRESS:
                 rd.skip_bytes(3);
                 BITSTREAM_DEBUG("init address");
                 break;
+            case BitstreamCommand::LSC_PROG_INCR_CMP:
+                // This is the main bitstream payload (compressed)
+                BITSTREAM_DEBUG("Compressed bitstream found");
+                if (!compression_dict)
+                    throw BitstreamParseError("start of compressed bitstream data before compression dictionary was stored", rd.get_offset());
+                // fall through
             case BitstreamCommand::LSC_PROG_INCR_RTI: {
                 // This is the main bitstream payload
                 if (!chip)
                     throw BitstreamParseError("start of bitstream data before chip was identified", rd.get_offset());
+                bool reversed_frames;
+                if (chip->info.family == "MachXO2")
+                    reversed_frames = false;
+                else if (chip->info.family == "ECP5")
+                    reversed_frames = true;
+                else
+                    throw BitstreamParseError("Unknown chip family: " + chip->info.family);
+
                 uint8_t params[3];
                 rd.get_bytes(params, 3);
                 BITSTREAM_DEBUG("settings: " << hex << setw(2) << int(params[0]) << " " << int(params[1]) << " "
                                              << int(params[2]));
-                size_t dummy_bytes = (params[0] & 0x0FU);
+                // I've only seen 0x81 for the ecp5 and 0x8e for the xo2 so far...
+                bool check_crc = params[0] & 0x80U;
+                // inverted value: a 0 means check after every frame
+                bool crc_after_each_frame = check_crc && !(params[0] & 0x40U);
+                // I don't know what these two are for I've seen both 1s (XO2) and both 0s (ECP5)
+                // The names are from the ECP5 docs
+                // bool include_dummy_bits = params[0] & 0x20U;
+                // bool include_dummy_bytes = params[0] & 0x10U;
+                size_t dummy_bytes = params[0] & 0x0FU;
                 size_t frame_count = (params[1] << 8U) | params[2];
-                BITSTREAM_NOTE(
-                        "reading " << std::dec << frame_count << " config frames (with " << std::dec << dummy_bytes
-                                   << " dummy bytes)");
+                BITSTREAM_NOTE("reading " << std::dec << frame_count << " config frames (with " << std::dec << dummy_bytes << " dummy bytes)");
                 size_t bytes_per_frame = (chip->info.bits_per_frame + chip->info.pad_bits_after_frame +
                                           chip->info.pad_bits_before_frame) / 8U;
+                // If compressed 0 bits are added to the stream before compression to make it 64 bit bounded, so
+                // we should consider that space here but they shouldn't be copied to the output
+                if ((BitstreamCommand) cmd == BitstreamCommand::LSC_PROG_INCR_CMP)
+                    bytes_per_frame += (7 - ((bytes_per_frame - 1) % 8));
                 unique_ptr<uint8_t[]> frame_bytes = make_unique<uint8_t[]>(bytes_per_frame);
                 for (size_t i = 0; i < frame_count; i++) {
-                    rd.get_bytes(frame_bytes.get(), bytes_per_frame);
+                    size_t idx = reversed_frames? (chip->info.num_frames - 1) - i : i;
+                    if ((BitstreamCommand) cmd == BitstreamCommand::LSC_PROG_INCR_CMP)
+                        rd.get_compressed_bytes(frame_bytes.get(), bytes_per_frame, compression_dict.get());
+                    else
+                        rd.get_bytes(frame_bytes.get(), bytes_per_frame);
+
                     for (int j = 0; j < chip->info.bits_per_frame; j++) {
                         size_t ofs = j + chip->info.pad_bits_after_frame;
-                        chip->cram.bit((chip->info.num_frames - 1) - i, j) = (char) (
-                                (frame_bytes[(bytes_per_frame - 1) - (ofs / 8)] >> (ofs % 8)) & 0x01);
+                        chip->cram.bit(idx, j) = (char)
+                            ((frame_bytes[(bytes_per_frame - 1) - (ofs / 8)] >> (ofs % 8)) & 0x01);
                     }
-                    rd.check_crc16();
+                    if (crc_after_each_frame || (check_crc && (i == frame_count-1)))
+                      rd.check_crc16();
                     rd.skip_bytes(dummy_bytes);
                 }
-                // Post-bitstream space for SECURITY and SED
-                // TODO: process SECURITY and SED
-                rd.skip_possible_dummy(8);
-                rd.skip_possible_dummy(4);
             }
                 break;
             case BitstreamCommand::LSC_EBR_ADDRESS: {
diff --git a/libtrellis/src/PyTrellis.cpp b/libtrellis/src/PyTrellis.cpp
index 2d58620..52e0fbf 100644
--- a/libtrellis/src/PyTrellis.cpp
+++ b/libtrellis/src/PyTrellis.cpp
@@ -63,7 +63,7 @@
             .def("write_bit", &Bitstream::write_bit_py)
             .def_readwrite("metadata", &Bitstream::metadata)
             .def_readwrite("data", &Bitstream::data)
-            .def("deserialise_chip", &Bitstream::deserialise_chip);
+        .def("deserialise_chip", static_cast<Chip (Bitstream::*)()>(&Bitstream::deserialise_chip));
 
     class_<DeviceLocator>("DeviceLocator")
             .def_readwrite("family", &DeviceLocator::family)
diff --git a/metadata/MachXO2/LCMXO2-1200HC/globals.json b/metadata/MachXO2/LCMXO2-1200HC/globals.json
new file mode 100644
index 0000000..a408e0b
--- /dev/null
+++ b/metadata/MachXO2/LCMXO2-1200HC/globals.json
@@ -0,0 +1,8 @@
+{
+  "quadrants": {
+  },
+  "taps": {
+  },
+  "spines": {
+  }
+}
diff --git a/metadata/MachXO2/LCMXO2-256HC/globals.json b/metadata/MachXO2/LCMXO2-256HC/globals.json
new file mode 100644
index 0000000..a408e0b
--- /dev/null
+++ b/metadata/MachXO2/LCMXO2-256HC/globals.json
@@ -0,0 +1,8 @@
+{
+  "quadrants": {
+  },
+  "taps": {
+  },
+  "spines": {
+  }
+}
diff --git a/metadata/MachXO2/LCMXO2-4000HC/globals.json b/metadata/MachXO2/LCMXO2-4000HC/globals.json
new file mode 100644
index 0000000..a408e0b
--- /dev/null
+++ b/metadata/MachXO2/LCMXO2-4000HC/globals.json
@@ -0,0 +1,8 @@
+{
+  "quadrants": {
+  },
+  "taps": {
+  },
+  "spines": {
+  }
+}
diff --git a/tools/extract_tilegrid.py b/tools/extract_tilegrid.py
index 1411d66..2b6b2ac 100755
--- a/tools/extract_tilegrid.py
+++ b/tools/extract_tilegrid.py
@@ -17,6 +17,7 @@
 
 import sys, re
 import json, argparse
+import machxo2_tiles
 
 tile_re = re.compile(
     r'^Tile\s+([A-Z0-9a-z_/]+)\s+\((\d+), (\d+)\)\s+bitmap offset\s+\((\d+), (\d+)\)\s+\<([A-Z0-9a-z_/]+)>\s*$')
@@ -25,6 +26,10 @@
     r'^\s+([A-Z0-9_]+) \((-?\d+), (-?\d+)\)')
 
 parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument('family', type=str,
+                    help="device family")
+parser.add_argument('device', type=str,
+                    help="device name")
 parser.add_argument('infile', type=argparse.FileType('r'),
                     help="input file from bstool")
 parser.add_argument('outfile', type=argparse.FileType('w'),
@@ -34,19 +39,38 @@
     args = parser.parse_args(argv[1:])
     tiles = {}
     current_tile = None
+    family = args.family
+    device = args.device
     for line in args.infile:
         tile_m = tile_re.match(line)
         if tile_m:
             name = tile_m.group(6)
-            current_tile = {
-                "type": tile_m.group(1),
-                "start_bit": int(tile_m.group(4)),
-                "start_frame": int(tile_m.group(5)),
-                "rows": int(tile_m.group(2)),
-                "cols": int(tile_m.group(3)),
-                "sites": []
-            }
-            identifier = name + ":" + tile_m.group(1)
+            if (family == "ECP5"):
+                current_tile = {
+                    "type": tile_m.group(1),
+                    "start_bit": int(tile_m.group(4)),
+                    "start_frame": int(tile_m.group(5)),
+                    "rows": int(tile_m.group(2)),
+                    "cols": int(tile_m.group(3)),
+                    "sites": []
+                }
+                identifier = name + ":" + tile_m.group(1)
+            elif (family == "MachXO2"):
+                # MachXO2 has the frames/bit and the rows/cols index reversed
+                # compared to ECP5
+                current_tile = {
+                    "type": tile_m.group(1),
+                    "start_bit": int(tile_m.group(5)),
+                    "start_frame": int(tile_m.group(4)),
+                    "rows": int(tile_m.group(3)),
+                    "cols": int(tile_m.group(2)),
+                    "sites": []
+                }
+                identifier = machxo2_tiles.fix_name(name, device) + ":" + tile_m.group(1)
+            else:
+                print("Unknown family: " + family)
+                sys.exit(1)
+
             assert identifier not in tiles
             tiles[identifier] = current_tile
         else:
diff --git a/tools/get_tilegrid_all.py b/tools/get_tilegrid_all.py
index 5e687b9..5882891 100755
--- a/tools/get_tilegrid_all.py
+++ b/tools/get_tilegrid_all.py
@@ -23,7 +23,7 @@
         for device in sorted(devices["families"][family]["devices"].keys()):
             diamond.run(device, "work_tilegrid/wire.v")
             output_file = path.join(database.get_db_subdir(family, device), "tilegrid.json")
-            extract_tilegrid.main(["extract_tilegrid", "work_tilegrid/wire.tmp/output.test", output_file])
+            extract_tilegrid.main(["extract_tilegrid", family, device, "work_tilegrid/wire.tmp/output.test", output_file])
 
 
 if __name__ == "__main__":
diff --git a/tools/html_tilegrid.py b/tools/html_tilegrid.py
index cc7e683..67e1b09 100755
--- a/tools/html_tilegrid.py
+++ b/tools/html_tilegrid.py
@@ -46,6 +46,7 @@
 
     max_row = device_info["max_row"]
     max_col = device_info["max_col"]
+    col_bias = device_info["col_bias"]
 
     tiles = []
     for i in range(max_row + 1):
@@ -71,10 +72,10 @@
     for trow in tiles:
         print("<tr>", file=f)
         row_max_height = 0
-        for tloc in trow:
+        for tloc in trow[col_bias:]:
             row_max_height = max(row_max_height, len(tloc))
         row_height = max(75, 30 * row_max_height)
-        for tloc in trow:
+        for tloc in trow[col_bias:]:
             print("<td style='border: 2px solid black; height: {}px'>".format(row_height), file=f)
             for tile in tloc:
                 print(
diff --git a/tools/machxo2_tiles.py b/tools/machxo2_tiles.py
new file mode 100644
index 0000000..6def0de
--- /dev/null
+++ b/tools/machxo2_tiles.py
@@ -0,0 +1,103 @@
+"""
+This helps fix missing tile positions in the MachXO2 family
+"""
+
+import re, sys
+import database
+
+rc_regex = re.compile(r"[A-Za-z0-9_]*R(\d+)C(\d+)")
+# MachXO2-specific
+center_regex = re.compile(r"CENTER(\d+)")
+centerb_regex = re.compile(r"CENTER_B")
+centert_regex = re.compile(r"CENTER_T")
+centerebr_regex = re.compile(r"CENTER_EBR(\d+)")
+t_regex = re.compile(r"[A-Za-z0-9_]*T(\d+)")
+b_regex = re.compile(r"[A-Za-z0-9_]*B(\d+)")
+l_regex = re.compile(r"[A-Za-z0-9_]*L(\d+)")
+r_regex = re.compile(r"[A-Za-z0-9_]*R(\d+)")
+
+def append_position(left_side, row, col):
+    return left_side + "_R" + str(row) + "C" + str(col)
+
+def update_name(name, center_col, ebr_row, right, bottom):
+    # Match in order of most-specific to least-specific
+    # (E.g. CENTER matches "R" regex too)
+    left = 1
+    top = 0
+
+    row = top
+    col = left
+
+    rc = rc_regex.match(name)
+    if rc:
+        # no need to rename
+        return name
+
+    centert = centert_regex.match(name)
+    if centert:
+        row = top
+        col = center_col
+        return append_position(name, row, col)
+
+    centerb = centerb_regex.match(name)
+    if centerb:
+        row = bottom
+        col = center_col
+        return append_position(name, row, col)
+
+    centerebr = centerebr_regex.match(name)
+    if centerebr:
+        row = ebr_row
+        col = center_col+1
+        return append_position(name, row, col)
+
+    center = center_regex.match(name)
+    if center:
+        row = int(center.group(1))
+        col = center_col
+        return append_position(name, row, col)
+
+    t = t_regex.match(name)
+    if t:
+        row = top
+        col = int(t.group(1))
+        return append_position(name, row, col)
+
+    b = b_regex.match(name)
+    if b:
+        row = bottom
+        col = int(b.group(1))
+        return append_position(name, row, col)
+
+    l = l_regex.match(name)
+    if l:
+        row = int(l.group(1))
+        col = left
+        return append_position(name, row, col)
+
+    r = r_regex.match(name)
+    if r:
+        row = int(r.group(1))
+        col = right
+        return append_position(name, row, col)
+
+def fix_name(name, device):
+
+    device_info = database.get_devices()["families"]["MachXO2"]["devices"][device]
+    max_row = device_info["max_row"]
+    max_col = device_info["max_col"]
+
+    if (device == 'LCMXO2-256HC'):
+        center_col = 5
+        ebr_row = 0 # don't care
+    elif (device == 'LCMXO2-1200HC'):
+        center_col = 13
+        ebr_row = 6
+    elif (device == 'LCMXO2-4000HC'):
+        center_col = 16
+        ebr_row = 11
+    else:
+        print("unknown device: " + device)
+        sys.exit(1)
+
+    return update_name(name, center_col, ebr_row, max_col, max_row)