libtrellis: Add experimental support for packing compressed bitstreams

Signed-off-by: David Shah <dave@ds0.me>
diff --git a/libtrellis/src/Bitstream.cpp b/libtrellis/src/Bitstream.cpp
index 991851f..9548759 100644
--- a/libtrellis/src/Bitstream.cpp
+++ b/libtrellis/src/Bitstream.cpp
@@ -10,7 +10,7 @@
 #include <iomanip>
 #include <fstream>
 #include <array>
-
+#include <queue>
 namespace Trellis {
 
 static const uint16_t CRC16_POLY = 0x8005;
@@ -175,6 +175,122 @@
         // if remaining bits > 0 they are just padding bits added to the end so we can ignore them
     }
 
+    // Decode a one-hot byte: return the index (0 = LSB .. 7 = MSB) of its single set bit, or -1 if `in` is not one-hot
+    int decode_onehot(uint8_t in) {
+        switch(in) {
+            case 0b00000001:
+                return 0;
+            case 0b00000010:
+                return 1;
+            case 0b00000100:
+                return 2;
+            case 0b00001000:
+                return 3;
+            case 0b00010000:
+                return 4;
+            case 0b00100000:
+                return 5;
+            case 0b01000000:
+                return 6;
+            case 0b10000000:
+                return 7;
+            default: // zero, or more than one bit set
+                return -1;
+        }
+    }
+
+    void write_compressed_frames(const std::vector<std::vector<uint8_t>> &frames_in) { // emit the 8-entry dictionary, then every frame of frames_in in compressed form
+        // Build a histogram of byte values over all frames; it is used to pick the dictionary entries
+        int histogram[256];
+        for (int i = 0; i < 256; i++)
+            histogram[i] = 0;
+        for (auto &fr : frames_in)
+            for (auto b : fr)
+                ++histogram[b];
+        std::priority_queue<std::pair<int, uint8_t>> most_frequent; // max-heap: highest count on top, ties go to the larger byte value
+        for (int i = 0; i < 256; i++)
+            if (i != 0 && (decode_onehot(i) == -1)) // zero and one-hot bytes get their own short codes below, so they don't go in dict
+                most_frequent.push(std::make_pair(histogram[i], i));
+        uint8_t dict_entries[8]; // the 8 most frequent remaining byte values, most frequent first
+        for (int i = 0; i < 8; i++) {
+            dict_entries[i] = most_frequent.top().second;
+            most_frequent.pop();
+        }
+        // Write dictionary: command byte, insert_zeros(3), then the entries from index 7 down to 0
+        write_byte(uint8_t(BitstreamCommand::LSC_WRITE_COMP_DIC));
+        insert_zeros(3);
+        for (int i = 7; i >= 0; i--)
+            write_byte(dict_entries[i]);
+        // Write data: command byte, 0x91 byte, then the frame count as 16 bits, high byte first
+        write_byte(uint8_t(BitstreamCommand::LSC_PROG_INCR_CMP));
+        write_byte(0x91); //CRC check, 1 dummy byte
+        uint16_t frames = uint16_t(frames_in.size()); // frame count is truncated to 16 bits
+        write_byte(uint8_t((frames >> 8) & 0xFF));
+        write_byte(uint8_t(frames & 0xFF));
+
+        // Bit writer state: bits are packed MSB-first into buffer and emitted through write_byte
+        uint8_t buffer = 0;
+        int bits_in_buffer = 0;
+        auto flush_bits = [&]() { // write out a partially filled byte (unused low bits stay 0); no-op if no bits are pending
+            if (bits_in_buffer != 0) {
+                write_byte(buffer);
+                buffer = 0;
+                bits_in_buffer = 0;
+            }
+        };
+        auto add_bit = [&](bool bit) { // append one bit; a completed byte is written out immediately
+            if (bit)
+                buffer |= (1 << (7 - bits_in_buffer));
+            bits_in_buffer++;
+            if (bits_in_buffer == 8)
+                flush_bits();
+        };
+        auto add_bits = [&](uint32_t x, int len) { // append the low len bits of x, most significant first
+            for (int i = len-1; i >= 0; i--)
+                add_bit((x & (1 << i)) != 0);
+        };
+        // A frame whose size is not a multiple of 8 bytes is first padded with zero bits; each zero bit
+        // stands for one zero byte, so the frame is extended to a multiple of 64 bits
+        for (auto &fr : frames_in) {
+            int frame_bytes = int(fr.size());
+            if ((frame_bytes % 8) != 0)
+                for (int i = 0; i < (8 - (frame_bytes % 8)); i++)
+                    add_bit(0);
+            // Encode each byte of the frame: zero, one-hot, dictionary entry, else literal
+            for (auto b : fr) {
+                if (b == 0) {
+                    add_bit(0); // 0 bit -> 0 byte
+                    continue;
+                }
+                int oh = decode_onehot(b);
+                if (oh != -1) {
+                    add_bits(0b100, 3); // 0b100xxx -> only bit xxx set in byte
+                    add_bits(oh, 3);
+                    continue;
+                }
+                // Search dictionary
+                for (int j = 0; j < 8; j++)
+                    if (dict_entries[j] == b) {
+                        add_bits(0b101, 3); // 0b101xxx -> dictionary entry xxx
+                        add_bits(j, 3);
+                        goto dict_found; // skip the literal fallback below
+                    }
+                if (false) { // never entered by fall-through; only reachable via the goto above
+                dict_found:
+                    continue;
+                }
+                // Incompressible byte; use literal
+                add_bits(0b11, 2); // 0b11xxxxxxxx -> literal byte
+                add_bits(b, 8);
+            }
+            // Flush any partial byte so the compressed frame ends on an 8-bit boundary
+            flush_bits();
+            // Post-frame CRC and 0xFF byte
+            insert_crc16();
+            write_byte(0xFF);
+        }
+    }
+
     // Write multiple bytes from an InputIterator and update CRC
     template<typename T>
     void write_bytes(T in, size_t count) {
@@ -620,27 +736,49 @@
     // Init address
     wr.write_byte(uint8_t(BitstreamCommand::LSC_INIT_ADDRESS));
     wr.insert_zeros(3);
-    // Bitstream data
-    wr.write_byte(uint8_t(BitstreamCommand::LSC_PROG_INCR_RTI));
-    wr.write_byte(0x91); //CRC check, 1 dummy byte
-    uint16_t frames = uint16_t(chip.info.num_frames);
-    wr.write_byte(uint8_t((frames >> 8) & 0xFF));
-    wr.write_byte(uint8_t(frames & 0xFF));
-    size_t bytes_per_frame = (chip.info.bits_per_frame + chip.info.pad_bits_after_frame +
-                              chip.info.pad_bits_before_frame) / 8U;
-    unique_ptr<uint8_t[]> frame_bytes = make_unique<uint8_t[]>(bytes_per_frame);
-    for (size_t i = 0; i < frames; i++) {
-        fill(frame_bytes.get(), frame_bytes.get() + bytes_per_frame, 0x00);
-        for (int j = 0; j < chip.info.bits_per_frame; j++) {
-            size_t ofs = j + chip.info.pad_bits_after_frame;
-            assert(((bytes_per_frame - 1) - (ofs / 8)) < bytes_per_frame);
-            frame_bytes[(bytes_per_frame - 1) - (ofs / 8)] |=
-                    (chip.cram.bit((chip.info.num_frames - 1) - i, j) & 0x01) << (ofs % 8);
+    if (options.count("compress") && options.at("compress") == "yes") {
+        // First create an uncompressed array of frames
+        std::vector<std::vector<uint8_t>> frames_data;
+        uint16_t frames = uint16_t(chip.info.num_frames);
+        size_t bytes_per_frame = (chip.info.bits_per_frame + chip.info.pad_bits_after_frame +
+                                  chip.info.pad_bits_before_frame) / 8U;
+        for (size_t i = 0; i < frames; i++) {
+            frames_data.emplace_back();
+            auto &frame_bytes = frames_data.back();
+            frame_bytes.resize(bytes_per_frame);
+            for (int j = 0; j < chip.info.bits_per_frame; j++) {
+                size_t ofs = j + chip.info.pad_bits_after_frame;
+                assert(((bytes_per_frame - 1) - (ofs / 8)) < bytes_per_frame);
+                frame_bytes[(bytes_per_frame - 1) - (ofs / 8)] |=
+                        (chip.cram.bit((chip.info.num_frames - 1) - i, j) & 0x01) << (ofs % 8);
+            }
         }
-        wr.write_bytes(frame_bytes.get(), bytes_per_frame);
-        wr.insert_crc16();
-        wr.write_byte(0xFF);
+        // Then compress and write
+        wr.write_compressed_frames(frames_data);
+    } else {
+        // Bitstream data
+        wr.write_byte(uint8_t(BitstreamCommand::LSC_PROG_INCR_RTI));
+        wr.write_byte(0x91); //CRC check, 1 dummy byte
+        uint16_t frames = uint16_t(chip.info.num_frames);
+        wr.write_byte(uint8_t((frames >> 8) & 0xFF));
+        wr.write_byte(uint8_t(frames & 0xFF));
+        size_t bytes_per_frame = (chip.info.bits_per_frame + chip.info.pad_bits_after_frame +
+                                  chip.info.pad_bits_before_frame) / 8U;
+        unique_ptr<uint8_t[]> frame_bytes = make_unique<uint8_t[]>(bytes_per_frame);
+        for (size_t i = 0; i < frames; i++) {
+            fill(frame_bytes.get(), frame_bytes.get() + bytes_per_frame, 0x00);
+            for (int j = 0; j < chip.info.bits_per_frame; j++) {
+                size_t ofs = j + chip.info.pad_bits_after_frame;
+                assert(((bytes_per_frame - 1) - (ofs / 8)) < bytes_per_frame);
+                frame_bytes[(bytes_per_frame - 1) - (ofs / 8)] |=
+                        (chip.cram.bit((chip.info.num_frames - 1) - i, j) & 0x01) << (ofs % 8);
+            }
+            wr.write_bytes(frame_bytes.get(), bytes_per_frame);
+            wr.insert_crc16();
+            wr.write_byte(0xFF);
+        }
     }
+
     // Post-bitstream space for SECURITY and SED (not used here)
     wr.insert_dummy(12);
     // Program Usercode
diff --git a/libtrellis/tools/ecppack.cpp b/libtrellis/tools/ecppack.cpp
index dca3bb1..f060468 100644
--- a/libtrellis/tools/ecppack.cpp
+++ b/libtrellis/tools/ecppack.cpp
@@ -45,6 +45,7 @@
     options.add_options()("freq", po::value<std::string>(), "config frequency in MHz");
     options.add_options()("svf", po::value<std::string>(), "output SVF file");
     options.add_options()("svf-rowsize", po::value<int>(), "SVF row size in bits (default 8000)");
+    options.add_options()("compress", "compress bitstream to reduce size");
     options.add_options()("spimode", po::value<std::string>(), "SPI Mode to use (fast-read, dual-spi, qspi)");
     options.add_options()("background", "enable background reconfiguration in bitstream");
     options.add_options()("delta", po::value<std::string>(), "create a delta partial bitstream given a reference config");
@@ -132,6 +133,9 @@
     if (vm.count("spimode"))
         bitopts["spimode"] = vm["spimode"].as<string>();
 
+    if (vm.count("compress"))
+        bitopts["compress"] = "yes";
+
     if (vm.count("background")) {
         auto tile_db = get_tile_bitdata(TileLocator{c.info.family, c.info.name, "EFB0_PICB0"});
         auto esb = tile_db->get_data_for_enum("SYSCONFIG.BACKGROUND_RECONFIG");
@@ -209,15 +213,16 @@
     }
 
     if (vm.count("svf")) {
-
         // Create JTAG bitstream without SPI flash related settings, as these
         // seem to confuse the chip sometimes when configuring over JTAG
-        if (!bitopts.empty()) {
+        if (!bitopts.empty() && !(bitopts.size() == 1 && bitopts.count("compress"))) {
             bitopts.clear();
             if (vm.count("background"))
                 bitopts["background"] = "yes";
             if (vm.count("bootaddr"))
                 bitopts["multiboot"] = "yes";
+            if (vm.count("compress"))
+                bitopts["compress"] = "yes";
             b = Bitstream::serialise_chip(c, bitopts);
         }