Merge pull request #116 from samycharas/add_ql_dsp

Add ql-dsp plugin
diff --git a/.gitignore b/.gitignore
index 4c85aa2..56d3484 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 *.so
 *.swp
 *.log
+ql-qlf-plugin/pmgen/*
diff --git a/ql-qlf-plugin/Makefile b/ql-qlf-plugin/Makefile
index 402606a..0807881 100644
--- a/ql-qlf-plugin/Makefile
+++ b/ql-qlf-plugin/Makefile
@@ -1,5 +1,6 @@
 NAME = ql-qlf
-SOURCES = synth_quicklogic.cc
+SOURCES = synth_quicklogic.cc \
+	  ql-dsp.cc
 include ../Makefile_plugin.common
 
 COMMON		= common
@@ -14,9 +15,18 @@
 		  $(QLF_K6N10_DIR)/qlf_k6n10_brams.txt 		\
 		  $(QLF_K6N10_DIR)/qlf_k6n10_cells_sim.v 	\
 		  $(QLF_K6N10_DIR)/qlf_k6n10_ffs_map.v 		\
+		  $(QLF_K6N10_DIR)/qlf_k6n10_dsp_map.v 		\
 		  $(QLF_K6N10_DIR)/qlf_k6n10_lut_map.v
 
+# Fetch the pattern-matcher generator into pmgen/ at makefile parse time.
+# `mkdir -p` keeps this idempotent: a plain `mkdir` fails once the directory
+# exists, which prints an error on every later run and skips the download.
+retrieve-pmgen:=$(shell mkdir -p pmgen && wget -nc -O pmgen/pmgen.py https://raw.githubusercontent.com/SymbiFlow/yosys/master%2Bwip/passes/pmgen/pmgen.py)
+
+# Generate the matcher header included by ql-dsp.cc from ql_dsp.pmg.
+pre-build:=$(shell python3 pmgen/pmgen.py -o pmgen/ql-dsp-pm.h -p ql_dsp ql_dsp.pmg)
+
 install_modules: $(VERILOG_MODULES)
 	$(foreach f,$^,install -D $(f) $(DATA_DIR)/quicklogic/$(notdir $(f));)
 
 install: install_modules
+
+# Run the common clean rule, then drop the generated pattern-matcher header.
+# The header is written to pmgen/ (see pre-build), so it must be removed there.
+clean:
+	$(MAKE) -f ../Makefile_plugin.common $@
+	rm -f *pm.h pmgen/*pm.h
diff --git a/ql-qlf-plugin/ql-dsp.cc b/ql-qlf-plugin/ql-dsp.cc
new file mode 100644
index 0000000..1b03369
--- /dev/null
+++ b/ql-qlf-plugin/ql-dsp.cc
@@ -0,0 +1,164 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2021 QuickLogic Corp.
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/sigtools.h"
+#include "kernel/yosys.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+#include "pmgen/ql-dsp-pm.h"
+
+/**
+ * Callback invoked for each accepted ql_dsp pattern match.
+ *
+ * Rejects the match (with a log message) when input A or B exceeds 16 bits,
+ * the adder/accumulator signal exceeds 33 bits or the multiplier output
+ * exceeds 32 bits. Otherwise a matched $mul is replaced by a new QL_DSP cell
+ * (an already matched QL_DSP is reused), and its A/B/C/D inputs, CLK, O/CO
+ * outputs and *_REG / *_SIGNED parameters are set from the match state.
+ *
+ * NOTE(review): ffH, ffO and mux are only logged here, and no cell parameter
+ * records the absorbed pipeline/output registers or the post-adder -- confirm
+ * this matches the intended QL_DSP model.
+ *
+ * @param pm  Pattern matcher instance holding the module and the match state.
+ */
+void create_ql_dsp(ql_dsp_pm &pm)
+{
+    auto &st = pm.st_ql_dsp;
+
+    log("Checking %s.%s for QL DSP inference.\n", log_id(pm.module), log_id(st.mul));
+
+    log_debug("ffA:    %s\n", log_id(st.ffA, "--"));
+    log_debug("ffB:    %s\n", log_id(st.ffB, "--"));
+    log_debug("ffCD:   %s\n", log_id(st.ffCD, "--"));
+    log_debug("mul:    %s\n", log_id(st.mul, "--"));
+    log_debug("ffFJKG: %s\n", log_id(st.ffFJKG, "--"));
+    log_debug("ffH:    %s\n", log_id(st.ffH, "--"));
+    log_debug("add:    %s\n", log_id(st.add, "--"));
+    log_debug("mux:    %s\n", log_id(st.mux, "--"));
+    log_debug("ffO:    %s\n", log_id(st.ffO, "--"));
+    log_debug("\n");
+
+    // Width checks: give up on matches that do not fit the 16x16 block.
+    if (GetSize(st.sigA) > 16) {
+        log("  input A (%s) is too large (%d > 16).\n", log_signal(st.sigA), GetSize(st.sigA));
+        return;
+    }
+
+    if (GetSize(st.sigB) > 16) {
+        log("  input B (%s) is too large (%d > 16).\n", log_signal(st.sigB), GetSize(st.sigB));
+        return;
+    }
+
+    if (GetSize(st.sigO) > 33) {
+        log("  adder/accumulator (%s) is too large (%d > 33).\n", log_signal(st.sigO), GetSize(st.sigO));
+        return;
+    }
+
+    if (GetSize(st.sigH) > 32) {
+        log("  output (%s) is too large (%d > 32).\n", log_signal(st.sigH), GetSize(st.sigH));
+        return;
+    }
+
+    // A $mul gets a fresh QL_DSP cell that takes over its name; a QL_DSP
+    // (e.g. produced by techmap) is updated in place.
+    Cell *cell = st.mul;
+    if (cell->type == ID($mul)) {
+        log("  replacing %s with QL_DSP cell.\n", log_id(st.mul->type));
+
+        cell = pm.module->addCell(NEW_ID, ID(QL_DSP));
+        pm.module->swap_names(cell, st.mul);
+    } else
+        log_assert(cell->type == ID(QL_DSP));
+
+    // QL_DSP Input Interface
+    SigSpec A = st.sigA;
+    A.extend_u0(16, st.mul->getParam(ID::A_SIGNED).as_bool());
+    log_assert(GetSize(A) == 16);
+
+    SigSpec B = st.sigB;
+    B.extend_u0(16, st.mul->getParam(ID::B_SIGNED).as_bool());
+    log_assert(GetSize(B) == 16);
+
+    // Without an adder operand the {C,D} input is tied to zero.
+    SigSpec CD = st.sigCD;
+    if (CD.empty())
+        CD = RTLIL::Const(0, 32);
+    else
+        log_assert(GetSize(CD) == 32);
+
+    cell->setPort(ID::A, A);
+    cell->setPort(ID::B, B);
+    cell->setPort(ID::C, CD.extract(16, 16)); // upper half of {C,D}
+    cell->setPort(ID::D, CD.extract(0, 16));  // lower half of {C,D}
+
+    cell->setParam(ID(A_REG), st.ffA ? State::S1 : State::S0);
+    cell->setParam(ID(B_REG), st.ffB ? State::S1 : State::S0);
+    cell->setParam(ID(C_REG), st.ffCD ? State::S1 : State::S0);
+    cell->setParam(ID(D_REG), st.ffCD ? State::S1 : State::S0);
+
+    // Absorbed registers need a clock: connect the one recorded by the
+    // pattern matcher. st.clock stays at its default when no flip-flop matched.
+    if (st.clock != SigBit())
+        cell->setPort(ID::CLK, st.clock);
+
+    // QL_DSP Output Interface
+
+    SigSpec O = st.sigO;
+    int O_width = GetSize(O);
+    if (O_width == 33) {
+        log_assert(st.add);
+        // If we have a signed multiply-add, then perform sign extension
+        if (st.add->getParam(ID::A_SIGNED).as_bool() && st.add->getParam(ID::B_SIGNED).as_bool())
+            pm.module->connect(O[32], O[31]);
+        else
+            cell->setPort(ID::CO, O[32]);
+        O.remove(O_width - 1);
+    } else
+        cell->setPort(ID::CO, pm.module->addWire(NEW_ID));
+    log_assert(GetSize(O) <= 32);
+    // Pad unused upper output bits with a dangling wire so O is always 32 bits.
+    if (GetSize(O) < 32)
+        O.append(pm.module->addWire(NEW_ID, 32 - GetSize(O)));
+
+    cell->setPort(ID::O, O);
+
+    cell->setParam(ID::A_SIGNED, st.mul->getParam(ID::A_SIGNED).as_bool());
+    cell->setParam(ID::B_SIGNED, st.mul->getParam(ID::B_SIGNED).as_bool());
+
+    // Drop the replaced $mul; a reused QL_DSP is blacklisted instead so it is
+    // not matched again.
+    if (cell != st.mul)
+        pm.autoremove(st.mul);
+    else
+        pm.blacklist(st.mul);
+    pm.autoremove(st.ffFJKG);
+    pm.autoremove(st.add);
+}
+
+/// Yosys pass "ql_dsp": runs the generated ql_dsp pattern matcher over every
+/// selected module and hands each match to create_ql_dsp().
+struct QlDspPass : public Pass {
+    QlDspPass() : Pass("ql_dsp", "ql: map multipliers") {}
+
+    /// Prints the command help text.
+    void help() override
+    {
+        //   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
+        log("\n");
+        log("    ql_dsp [options] [selection]\n");
+        log("\n");
+        log("Map multipliers ($mul/QL_DSP) and multiply-accumulate ($mul/QL_DSP + $add)\n");
+        log("cells into ql DSP resources.\n");
+        log("Pack input registers (A, B, {C,D}), pipeline registers\n");
+        log("({F,J,K,G}, H), output registers (O -- full 32-bits or lower 16-bits only);\n");
+        log("and post-adder into the QL_DSP resource.\n");
+        log("\n");
+        log("Multiply-accumulate operations using the post-adder with feedback on the {C,D}\n");
+        log("input will be folded into the DSP. In this scenario only, resetting the\n");
+        log("accumulator to an arbitrary value can be inferred to use the {C,D} input.\n");
+        log("\n");
+    }
+
+    /// Runs the pass. No options are recognised yet; the argument loop only
+    /// establishes where the selection arguments start for extra_args().
+    void execute(std::vector<std::string> args, RTLIL::Design *design) override
+    {
+        log_header(design, "Executing ql_DSP pass (map multipliers).\n");
+
+        size_t argidx;
+        for (argidx = 1; argidx < args.size(); argidx++) {
+            break;
+        }
+        extra_args(args, argidx, design);
+
+        for (auto module : design->selected_modules())
+            ql_dsp_pm(module, module->selected_cells()).run_ql_dsp(create_ql_dsp);
+    }
+} QlDspPass;
+
+PRIVATE_NAMESPACE_END
diff --git a/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_cells_sim.v b/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_cells_sim.v
index 2d415b6..b3f8357 100644
--- a/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_cells_sim.v
+++ b/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_cells_sim.v
@@ -387,3 +387,59 @@
     end
 endmodule
 
+// Simulation model of the QL_DSP block: a 16x16 multiplier built from four
+// 8x8 partial products (F, J, K, G) that are summed into a 32-bit result.
+// The input registers are declared but never clocked in this model, and the
+// C/D inputs and CO output are not used by the datapath below.
+module QL_DSP (
+    input CLK,
+    input [15:0] A, B, C, D,
+    output [31:0] O,
+    output CO // Currently unused, left in case we want to support signed operations in the future.
+);
+    parameter [0:0] A_REG = 0;
+    parameter [0:0] B_REG = 0;
+    parameter [0:0] C_REG = 0;
+    parameter [0:0] D_REG = 0;
+    parameter [0:0] ENABLE_DSP = 0;
+    parameter [0:0] A_SIGNED = 0;
+    parameter [0:0] B_SIGNED = 0;
+
+    wire [15:0] iA, iB, iC, iD;
+    wire [15:0] iF, iJ, iK, iG;
+
+    // Regs C and A, currently unused
+    reg [15:0] rC, rA;
+
+    assign iC = C_REG ? rC : C;
+    assign iA = A_REG ? rA : A;
+
+    // Regs B and D, currently unused
+    reg [15:0] rB, rD;
+
+    assign iB = B_REG ? rB : B;
+    assign iD = D_REG ? rD : D;
+
+    // Multiplier Stage
+    // Split each operand into bytes; the high byte is sign-extended when the
+    // operand is signed, the low byte is always zero-extended.
+    wire [15:0] p_Ah_Bh, p_Al_Bh, p_Ah_Bl, p_Al_Bl;
+    wire [15:0] Ah, Al, Bh, Bl;
+    assign Ah = {A_SIGNED ? {8{iA[15]}} : 8'b0, iA[15: 8]};
+    assign Al = {8'b0, iA[ 7: 0]};
+    assign Bh = {B_SIGNED ? {8{iB[15]}} : 8'b0, iB[15: 8]};
+    assign Bl = {8'b0, iB[ 7: 0]};
+    assign p_Ah_Bh = Ah * Bh; // F
+    assign p_Al_Bh = {8'b0, Al[7:0]} * Bh; // J
+    assign p_Ah_Bl = Ah * {8'b0, Bl[7:0]}; // K
+    assign p_Al_Bl = Al * Bl; // G
+
+    assign iF = p_Ah_Bh;
+    assign iJ = p_Al_Bh;
+
+    assign iK = p_Ah_Bl;
+    assign iG = p_Al_Bl;
+
+    // Adder Stage
+    // iL must be declared explicitly: an undeclared name on the left-hand
+    // side of an assign becomes an implicit 1-bit net, which would truncate
+    // the product to a single bit.
+    wire [31:0] iL;
+    wire [23:0] iK_e = {A_SIGNED ? {8{iK[15]}} : 8'b0, iK};
+    wire [23:0] iJ_e = {B_SIGNED ? {8{iJ[15]}} : 8'b0, iJ};
+    assign iL = iG + (iK_e << 8) + (iJ_e << 8) + (iF << 16);
+
+    // Output Stage
+    assign O = iL;
+
+endmodule
diff --git a/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_dsp_map.v b/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_dsp_map.v
new file mode 100644
index 0000000..bab24e5
--- /dev/null
+++ b/ql-qlf-plugin/ql-qlf-k6n10/qlf_k6n10_dsp_map.v
@@ -0,0 +1,19 @@
+// Techmap rule: implements a $__MUL16X16 multiplier (the DSP_NAME given to
+// mul2dsp) with one QL_DSP cell, all input registers disabled.
+module \$__MUL16X16 (input [15:0] A, input [15:0] B, output [31:0] Y);
+	parameter A_SIGNED = 0;
+	parameter B_SIGNED = 0;
+	parameter A_WIDTH = 0;
+	parameter B_WIDTH = 0;
+	parameter Y_WIDTH = 0;
+
+	// Forward the signedness so that signed multiplies are not silently
+	// mapped as unsigned ones.
+	QL_DSP #(
+		.A_REG(1'b0),
+		.B_REG(1'b0),
+		.C_REG(1'b0),
+		.D_REG(1'b0),
+		.ENABLE_DSP(1'b1),
+		.A_SIGNED(A_SIGNED),
+		.B_SIGNED(B_SIGNED)
+	) _TECHMAP_REPLACE_ (
+		.A(A),
+		.B(B),
+		.O(Y)
+	);
+endmodule
diff --git a/ql-qlf-plugin/ql_dsp.pmg b/ql-qlf-plugin/ql_dsp.pmg
new file mode 100644
index 0000000..e607e66
--- /dev/null
+++ b/ql-qlf-plugin/ql_dsp.pmg
@@ -0,0 +1,415 @@
+pattern ql_dsp
+
+state <SigBit> clock
+state <bool> clock_pol cd_signed o_lo
+state <SigSpec> sigA sigB sigCD sigH sigO
+state <Cell*> add mux
+state <IdString> addAB muxAB
+
+state <Cell*> ffA ffB ffCD
+state <Cell*> ffFJKG ffH ffO
+//
+// subpattern
+state <bool> argSdff
+state <SigSpec> argQ argD
+udata <SigSpec> dffD dffQ
+udata <SigBit> dffclock
+udata <Cell*> dff
+udata <bool> dffclock_pol
+
+// Seed of the pattern: a $mul or QL_DSP cell whose A and B port widths add
+// up to more than 10 bits.
+match mul
+	select mul->type.in($mul, \QL_DSP)
+	select GetSize(mul->getPort(\A)) + GetSize(mul->getPort(\B)) > 10
+endmatch
+
+// Derive the effective multiplier operands (sigA, sigB) and the used part of
+// the multiplier output (sigH). Rejects the match when the output port is
+// 10 bits or narrower, or when its lowest bit has no other user.
+code sigA sigB sigH
+	// Strip repeated top bits (sign/zero extension) from an operand; one
+	// copy of a non-constant top bit is kept.
+	auto unextend = [](const SigSpec &sig) {
+		int i;
+		for (i = GetSize(sig)-1; i > 0; i--)
+			if (sig[i] != sig[i-1])
+				break;
+		// Do not remove non-const sign bit
+		if (sig[i].wire)
+			++i;
+		return sig.extract(0, i);
+	};
+	sigA = unextend(port(mul, \A));
+	sigB = unextend(port(mul, \B));
+
+	// The result port is named Y on $mul and O on QL_DSP.
+	SigSpec O;
+	if (mul->type == $mul)
+		O = mul->getPort(\Y);
+	else if (mul->type == \QL_DSP)
+		O = mul->getPort(\O);
+	else log_abort();
+	if (GetSize(O) <= 10)
+		reject;
+
+	// Only care about those bits that are used
+	int i;
+	for (i = 0; i < GetSize(O); i++) {
+		if (nusers(O[i]) <= 1)
+			break;
+		sigH.append(O[i]);
+	}
+	// sigH could have no users if downstream sinks (e.g. $add) are
+	//   narrower than the $mul result, for example
+	if (i == 0)
+		reject;
+
+	// All remaining upper bits must be unused as well.
+	log_assert(nusers(O.extract_end(i)) <= 1);
+endcode
+
+// Try to absorb a register in front of input A, unless mul is a QL_DSP whose
+// A_REG parameter is already set. On a hit the flip-flop is recorded as ffA,
+// its clock and polarity are adopted, and sigA moves to the register's D side.
+code argQ ffA sigA clock clock_pol
+	if (mul->type != \QL_DSP || !param(mul, \A_REG).as_bool()) {
+		argQ = sigA;
+		subpattern(in_dffe);
+		if (dff) {
+			ffA = dff;
+			clock = dffclock;
+			clock_pol = dffclock_pol;
+			sigA = dffD;
+		}
+	}
+endcode
+
+// Same as the A-input step, for input B: skipped when mul is a QL_DSP whose
+// B_REG parameter is set; otherwise a matching flip-flop becomes ffB and sigB
+// moves to its D side.
+code argQ ffB sigB clock clock_pol
+	if (mul->type != \QL_DSP || !param(mul, \B_REG).as_bool()) {
+		argQ = sigB;
+		subpattern(in_dffe);
+		if (dff) {
+			ffB = dff;
+			clock = dffclock;
+			clock_pol = dffclock_pol;
+			sigB = dffD;
+		}
+	}
+endcode
+
+code argD argSdff ffFJKG sigH clock clock_pol
+	if (nusers(sigH) == 2 &&
+			(mul->type != \QL_DSP)) {
+		argD = sigH;
+		argSdff = false;
+		subpattern(out_dffe);
+		if (dff) {
+			// F/J/K/G do not have a CE-like (hold) input
+			if (dff->hasPort(\EN))
+				goto reject_ffFJKG;
+
+			// Reset signal of F/J (IRSTTOP) and K/G (IRSTBOT)
+			//   shared with A and B
+			if (ffA) {
+				if (ffA->hasPort(\ARST) != dff->hasPort(\ARST))
+					goto reject_ffFJKG;
+				if (ffA->hasPort(\ARST)) {
+					if (port(ffA, \ARST) != port(dff, \ARST))
+						goto reject_ffFJKG;
+					if (param(ffA, \ARST_POLARITY) != param(dff, \ARST_POLARITY))
+						goto reject_ffFJKG;
+				}
+			}
+			if (ffB) {
+				if (ffB->hasPort(\ARST) != dff->hasPort(\ARST))
+					goto reject_ffFJKG;
+				if (ffB->hasPort(\ARST)) {
+					if (port(ffB, \ARST) != port(dff, \ARST))
+						goto reject_ffFJKG;
+					if (param(ffB, \ARST_POLARITY) != param(dff, \ARST_POLARITY))
+						goto reject_ffFJKG;
+				}
+			}
+
+			ffFJKG = dff;
+			clock = dffclock;
+			clock_pol = dffclock_pol;
+			sigH = dffQ;
+
+reject_ffFJKG: 		;
+		}
+	}
+endcode
+
+// Try to absorb a second pipeline register (H) behind the F/J/K/G register.
+// Only attempted for a $mul with a matched ffFJKG whose output has exactly
+// one other user. Afterwards sigO starts out as sigH.
+code argD argSdff ffH sigH sigO clock clock_pol
+	if (ffFJKG && nusers(sigH) == 2 &&
+			(mul->type != \QL_DSP)) {
+		argD = sigH;
+		argSdff = false;
+		subpattern(out_dffe);
+		if (dff) {
+			// H does not have a CE-like (hold) input
+			if (dff->hasPort(\EN))
+				goto reject_ffH;
+
+			// Reset signal of H (IRSTBOT) shared with B.
+			// ffB is null when no B input register was matched, so it
+			// must be checked before use (as the other register steps do).
+			if (ffB) {
+				if (ffB->hasPort(\ARST) != dff->hasPort(\ARST))
+					goto reject_ffH;
+				if (ffB->hasPort(\ARST)) {
+					if (port(ffB, \ARST) != port(dff, \ARST))
+						goto reject_ffH;
+					if (param(ffB, \ARST_POLARITY) != param(dff, \ARST_POLARITY))
+						goto reject_ffH;
+				}
+			}
+
+			ffH = dff;
+			clock = dffclock;
+			clock_pol = dffclock_pol;
+			sigH = dffQ;
+
+reject_ffH:		;
+		}
+	}
+
+	sigO = sigH;
+endcode
+
+match add
+	if mul->type != \QL_DSP || (param(mul, \ENABLE_DSP).as_int() == 1)
+
+	select add->type.in($add)
+	choice <IdString> AB {\A, \B}
+	select nusers(port(add, AB)) == 2
+
+	index <SigBit> port(add, AB)[0] === sigH[0]
+	filter GetSize(port(add, AB)) <= GetSize(sigH)
+	filter port(add, AB) == sigH.extract(0, GetSize(port(add, AB)))
+	filter nusers(sigH.extract_end(GetSize(port(add, AB)))) <= 1
+	set addAB AB
+	optional
+endmatch
+
+code sigCD sigO cd_signed
+	if (add) {
+		sigCD = port(add, addAB == \A ? \B : \A);
+		cd_signed = param(add, addAB == \A ? \B_SIGNED : \A_SIGNED).as_bool();
+
+		int natural_mul_width = GetSize(sigA) + GetSize(sigB);
+		int actual_mul_width = GetSize(sigH);
+		int actual_acc_width = GetSize(sigCD);
+
+		if ((actual_acc_width > actual_mul_width) && (natural_mul_width > actual_mul_width))
+			reject;
+		// If accumulator, check adder width and signedness
+		if (sigCD == sigH && (actual_acc_width != actual_mul_width) && (param(mul, \A_SIGNED).as_bool() != param(add, \A_SIGNED).as_bool()))
+			reject;
+
+		sigO = port(add, \Y);
+	}
+endcode
+
+match mux
+	select mux->type == $mux
+	choice <IdString> AB {\A, \B}
+	select nusers(port(mux, AB)) == 2
+	index <SigSpec> port(mux, AB) === sigO
+	set muxAB AB
+	optional
+endmatch
+
+code sigO
+	if (mux)
+		sigO = port(mux, \Y);
+endcode
+
+code argD argSdff ffO sigO sigCD clock clock_pol cd_signed o_lo
+	if (mul->type != \QL_DSP ||
+			// Ensure that register is not already used
+			((param(mul, \ENABLE_DSP).as_int() == 1))) {
+
+		dff = nullptr;
+
+		// First try entire sigO
+		if (nusers(sigO) == 2) {
+			argD = sigO;
+			argSdff = !mux;
+			subpattern(out_dffe);
+		}
+
+		// Otherwise try just its least significant 16 bits
+		if (!dff && GetSize(sigO) > 16) {
+			argD = sigO.extract(0, 16);
+			if (nusers(argD) == 2) {
+				argSdff = !mux;
+				subpattern(out_dffe);
+				o_lo = dff;
+			}
+		}
+
+		if (dff) {
+			ffO = dff;
+			clock = dffclock;
+			clock_pol = dffclock_pol;
+
+			sigO.replace(sigO.extract(0, GetSize(dffQ)), dffQ);
+		}
+
+		// Loading value into output register is not
+		//   supported unless using accumulator
+		if (mux) {
+			if (sigCD != sigO)
+				reject;
+			sigCD = port(mux, muxAB == \B ? \A : \B);
+
+			cd_signed = add && param(add, \A_SIGNED).as_bool() && param(add, \B_SIGNED).as_bool();
+		} else if (dff && dff->hasPort(\SRST)) {
+			if (sigCD != sigO)
+				reject;
+			sigCD = param(dff, \SRST_VALUE);
+
+			cd_signed = add && param(add, \A_SIGNED).as_bool() && param(add, \B_SIGNED).as_bool();
+		}
+	}
+endcode
+
+code argQ ffCD sigCD clock clock_pol
+	if (!sigCD.empty() && sigCD != sigO &&
+			(mul->type != \QL_DSP || (!param(mul, \C_REG).as_bool() && !param(mul, \D_REG).as_bool()))) {
+		argQ = sigCD;
+		subpattern(in_dffe);
+		if (dff) {
+			// Reset signal of C (IRSTTOP) and D (IRSTBOT)
+			//   shared with A and B
+			if (ffA) {
+				if (ffA->hasPort(\ARST) != dff->hasPort(\ARST))
+					goto reject_ffCD;
+				if (ffA->hasPort(\ARST)) {
+					if (port(ffA, \ARST) != port(dff, \ARST))
+						goto reject_ffCD;
+					if (param(ffA, \ARST_POLARITY) != param(dff, \ARST_POLARITY))
+						goto reject_ffCD;
+				}
+			}
+			if (ffB) {
+				if (ffB->hasPort(\ARST) != dff->hasPort(\ARST))
+					goto reject_ffCD;
+				if (ffB->hasPort(\ARST)) {
+					if (port(ffB, \ARST) != port(dff, \ARST))
+						goto reject_ffCD;
+					if (param(ffB, \ARST_POLARITY) != param(dff, \ARST_POLARITY))
+						goto reject_ffCD;
+				}
+			}
+
+			ffCD = dff;
+			clock = dffclock;
+			clock_pol = dffclock_pol;
+			sigCD = dffD;
+
+reject_ffCD: 		;
+		}
+	}
+endcode
+
+code sigCD
+	sigCD.extend_u0(32, cd_signed);
+endcode
+
+code
+	accept;
+endcode
+
+// #######################
+
+subpattern in_dffe
+arg argD argQ clock clock_pol
+
+code
+	dff = nullptr;
+	if (argQ.empty())
+		reject;
+	for (auto c : argQ.chunks()) {
+		if (!c.wire)
+			reject;
+		if (c.wire->get_bool_attribute(\keep))
+			reject;
+		Const init = c.wire->attributes.at(\init, State::Sx);
+		if (!init.is_fully_undef() && !init.is_fully_zero())
+			reject;
+	}
+endcode
+
+// Find a positive-edge $dff/$dffe whose Q output, starting at some bit
+// offset, drives all of argQ.
+match ff
+	select ff->type.in($dff, $dffe)
+	// QL_DSP does not support clock inversion
+	select param(ff, \CLK_POLARITY).as_bool()
+
+	slice offset GetSize(port(ff, \D))
+	index <SigBit> port(ff, \Q)[offset] === argQ[0]
+
+	// Check that the rest of argQ is present
+	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
+	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
+endmatch
+
+code argQ argD
+{
+	if (clock != SigBit()) {
+		if (port(ff, \CLK) != clock)
+			reject;
+		if (param(ff, \CLK_POLARITY).as_bool() != clock_pol)
+			reject;
+	}
+
+	SigSpec Q = port(ff, \Q);
+	dff = ff;
+	dffclock = port(ff, \CLK);
+	dffclock_pol = param(ff, \CLK_POLARITY).as_bool();
+	dffD = argQ;
+	argD = port(ff, \D);
+	argQ = Q;
+	dffD.replace(argQ, argD);
+}
+endcode
+
+// #######################
+
+subpattern out_dffe
+arg argD argSdff argQ clock clock_pol
+
+code
+	dff = nullptr;
+	for (auto c : argD.chunks())
+		if (c.wire->get_bool_attribute(\keep))
+			reject;
+endcode
+
+match ff
+	select ff->type.in($dff, $dffe, $sdff, $sdffce)
+	// QL_DSP does not support clock inversion
+	select param(ff, \CLK_POLARITY).as_bool()
+
+	slice offset GetSize(port(ff, \D))
+	index <SigBit> port(ff, \D)[offset] === argD[0]
+
+	// Only allow sync reset if requested.
+	filter argSdff || ff->type.in($dff, $dffe)
+	// Check that the rest of argD is present
+	filter GetSize(port(ff, \D)) >= offset + GetSize(argD)
+	filter port(ff, \D).extract(offset, GetSize(argD)) == argD
+endmatch
+
+code argQ
+	if (ff) {
+		if (clock != SigBit()) {
+			if (port(ff, \CLK) != clock)
+				reject;
+			if (param(ff, \CLK_POLARITY).as_bool() != clock_pol)
+				reject;
+		}
+		SigSpec D = port(ff, \D);
+		SigSpec Q = port(ff, \Q);
+		argQ = argD;
+		argQ.replace(D, Q);
+
+		for (auto c : argQ.chunks()) {
+			Const init = c.wire->attributes.at(\init, State::Sx);
+			if (!init.is_fully_undef() && !init.is_fully_zero())
+				reject;
+		}
+
+		dff = ff;
+		dffQ = argQ;
+		dffclock = port(ff, \CLK);
+		dffclock_pol = param(ff, \CLK_POLARITY).as_bool();
+	}
+endcode
diff --git a/ql-qlf-plugin/synth_quicklogic.cc b/ql-qlf-plugin/synth_quicklogic.cc
index 7bb2210..d851d38 100644
--- a/ql-qlf-plugin/synth_quicklogic.cc
+++ b/ql-qlf-plugin/synth_quicklogic.cc
@@ -60,6 +60,10 @@
         log("        write the design to the specified verilog file. writing of an output file\n");
         log("        is omitted if this parameter is not specified.\n");
         log("\n");
+        log("    -no_dsp\n");
+        log("        By default use DSP blocks in output netlist.\n");
+        log("        Specifying this switch disables the use of DSP blocks to implement\n");
+        log("        multipliers and associated logic.\n");
+        log("\n");
         log("    -no_adder\n");
         log("        By default use adder cells in output netlist.\n");
         log("        Specifying this switch turns it off.\n");
@@ -78,6 +82,7 @@
     }
 
     string top_opt, edif_file, blif_file, family, currmodule, verilog_file;
+    bool nodsp;
     bool inferAdder;
     bool inferBram;
     bool abcOpt;
@@ -95,6 +100,7 @@
         inferBram = true;
         abcOpt = true;
         noffmap = false;
+        nodsp = false;
     }
 
     void execute(std::vector<std::string> args, RTLIL::Design *design) override
@@ -125,6 +131,10 @@
                 verilog_file = args[++argidx];
                 continue;
             }
+            if (args[argidx] == "-no_dsp") {
+                nodsp = true;
+                continue;
+            }
             if (args[argidx] == "-no_adder") {
                 inferAdder = false;
                 continue;
@@ -185,7 +195,22 @@
             run("peepopt");
             run("pmuxtree");
             run("opt_clean");
-
+            // DSP inference: only for qlf_k6n10 and only when -no_dsp was NOT
+            // given, so the help labels read "unless -no_dsp".
+            if (help_mode || (!nodsp && family == "qlf_k6n10")) {
+                run("memory_dff");
+                run("wreduce t:$mul");
+                run("techmap -map +/mul2dsp.v -map +/quicklogic/" + family +
+                      "_dsp_map.v -D DSP_A_MAXWIDTH=16 -D DSP_B_MAXWIDTH=16 "
+                      "-D DSP_A_MINWIDTH=2 -D DSP_B_MINWIDTH=2 -D DSP_Y_MINWIDTH=11 "
+                      "-D DSP_NAME=$__MUL16X16",
+                    "(unless -no_dsp)");
+                run("select a:mul2dsp", "              (unless -no_dsp)");
+                run("setattr -unset mul2dsp", "        (unless -no_dsp)");
+                run("opt_expr -fine", "                (unless -no_dsp)");
+                run("wreduce", "                       (unless -no_dsp)");
+                run("select -clear", "                 (unless -no_dsp)");
+                run("ql_dsp", "                        (unless -no_dsp)");
+                run("chtype -set $mul t:$__soft_mul", "(unless -no_dsp)");
+            }
             run("alumacc");
             run("opt");
             run("fsm");
@@ -229,7 +254,7 @@
                     "$_DLATCHSR_PPP_ 0");
                 //    In case we add clock inversion in the future.
                 //    run("dfflegalize -cell $_DFF_?_ 0 -cell $_DFF_?P?_ 0 -cell $_DFFE_?P?P_ 0 -cell $_DFFSR_?PP_ 0 -cell $_DFFSRE_?PPP_ 0 -cell
-                //    $_DLATCH_PPP_ 0");
+                //    $_DLATCH_SRPPP_ 0");
             } else {
                 run("dfflegalize -cell $_DFF_P_ 0 -cell $_DFF_P??_ 0 -cell $_DFF_N_ 0 -cell $_DFF_N??_ 0 -cell $_DFFSR_???_ 0");
             }
diff --git a/ql-qlf-plugin/tests/Makefile b/ql-qlf-plugin/tests/Makefile
index b64701e..46587e7 100644
--- a/ql-qlf-plugin/tests/Makefile
+++ b/ql-qlf-plugin/tests/Makefile
@@ -5,6 +5,8 @@
 	shreg \
 	iob_no_flatten \
 	full_adder \
+	mac_unit \
+	multiplier \
 	logic
 
 include $(shell pwd)/../../Makefile_test.common
@@ -14,4 +16,6 @@
 iob_no_flatten_verify = true
 latches_verify = true
 full_adder_verify = true
+mac_unit_verify = true
+multiplier_verify = true
 logic_verify = true
diff --git a/ql-qlf-plugin/tests/mac_unit/mac_unit.tcl b/ql-qlf-plugin/tests/mac_unit/mac_unit.tcl
new file mode 100644
index 0000000..5028d41
--- /dev/null
+++ b/ql-qlf-plugin/tests/mac_unit/mac_unit.tcl
@@ -0,0 +1,22 @@
+# Test: synthesize the mac_unit design for qlf_k6n10 twice -- once with the
+# default flow (exactly one QL_DSP cell expected) and once with -no_dsp
+# (no QL_DSP cell expected).
+yosys -import
+if { [info procs synth_quicklogic] == {} } { plugin -i ql-qlf}
+yosys -import  ;# ingest plugin commands
+
+set TOP "mac_unit"
+read_verilog $::env(DESIGN_TOP).v
+# Keep the freshly read design so the second run starts from the same state
+design -save read
+
+#Infer QL_DSP
+hierarchy -top $TOP
+synth_quicklogic -family qlf_k6n10 -top $TOP
+yosys cd $TOP
+stat
+select -assert-count 1 t:QL_DSP
+
+#Test no_dsp arg
+design -load read
+hierarchy -top $TOP
+synth_quicklogic -family qlf_k6n10 -top $TOP -no_dsp
+yosys cd $TOP
+stat
+select -assert-count 0 t:QL_DSP
diff --git a/ql-qlf-plugin/tests/mac_unit/mac_unit.v b/ql-qlf-plugin/tests/mac_unit/mac_unit.v
new file mode 100644
index 0000000..bcec450
--- /dev/null
+++ b/ql-qlf-plugin/tests/mac_unit/mac_unit.v
@@ -0,0 +1,8 @@
+// Multiply-accumulate test design: out = a * b + out, DATA_WIDTH-bit inputs
+// and a 2*DATA_WIDTH-bit output.
+// NOTE(review): there is no clock port, so `out` feeds back into itself
+// through a continuous assignment -- confirm this is the intended stimulus
+// for the multiply-accumulate inference test.
+module mac_unit(a, b, out);
+  parameter DATA_WIDTH = 16;
+  input [DATA_WIDTH - 1 : 0] a, b;
+  output [2*DATA_WIDTH - 1 : 0] out;
+
+  assign out = a * b + out;
+endmodule
+
diff --git a/ql-qlf-plugin/tests/multiplier/multiplier.tcl b/ql-qlf-plugin/tests/multiplier/multiplier.tcl
new file mode 100644
index 0000000..762bac6
--- /dev/null
+++ b/ql-qlf-plugin/tests/multiplier/multiplier.tcl
@@ -0,0 +1,22 @@
+# Test: synthesize the mult16x16 design for qlf_k6n10 twice -- once with the
+# default flow (exactly one QL_DSP cell expected) and once with -no_dsp
+# (no QL_DSP cell expected).
+yosys -import
+if { [info procs synth_quicklogic] == {} } { plugin -i ql-qlf}
+yosys -import  ;# ingest plugin commands
+
+set TOP "mult16x16"
+read_verilog $::env(DESIGN_TOP).v
+# Keep the freshly read design so the second run starts from the same state
+design -save read
+
+#Infer QL_DSP
+hierarchy -top $TOP
+synth_quicklogic -family qlf_k6n10 -top $TOP
+yosys cd $TOP
+stat
+select -assert-count 1 t:QL_DSP
+
+#Test no_dsp arg
+design -load read
+hierarchy -top $TOP
+synth_quicklogic -family qlf_k6n10 -top $TOP -no_dsp
+yosys cd $TOP
+stat
+select -assert-count 0 t:QL_DSP
diff --git a/ql-qlf-plugin/tests/multiplier/multiplier.v b/ql-qlf-plugin/tests/multiplier/multiplier.v
new file mode 100644
index 0000000..70f9a23
--- /dev/null
+++ b/ql-qlf-plugin/tests/multiplier/multiplier.v
@@ -0,0 +1,7 @@
+// Plain combinational multiplier test design: out = a * b, with
+// DATA_WIDTH-bit inputs and a 2*DATA_WIDTH-bit output.
+module mult16x16(a, b, out);
+  parameter DATA_WIDTH = 16;
+  input [DATA_WIDTH - 1 : 0] a, b;
+  output [2*DATA_WIDTH - 1 : 0] out;
+
+  assign out = a * b;
+endmodule