ZipCPU/mpygen.cpp

## mpygen.cpp
/////////////////////////////////////////////////////////////////////////////
//
// Filename: 	mpygen.cpp
//
// Project:	A simple, better, multiply generator
//
// Purpose:	This is the product of studying the signed, two's complement,
//		multiply operation presented on wikipedia,
//
//		http://en.wikipedia.org/wiki/Binary_multiplier
//
//		Basically, this file supports the routine, bestmpy, which
//		builds a multiply for any pair of bitwidths.  While I
//		didn't really want to build this in a coregen fashion,
//		such as you have here, the multiply operator kept changing
//		depending upon the bitwidth.  I can handle such changes in
//		C++, so ... thus I have computer generated multiply code.
//		As a special benefit, the number of clock cycles that
//		this routine takes is given by ceiling of the base two
//		logarithm of the number of bits in the greater operand,
//		plus one more.  Thus a 16x20 multiply requires 6 clocks,
//		whereas a 64x96 bit multiply (if you could build it), would
//		require 8 clocks.  Not bad, huh?  Oh, and the memory
//		requirements get smaller at each stage, as opposed to the
//		shiftadd multiply where the requirements were constant on
//		a per stage level, and often many more than I needed.
//
//		The basic operation is to create a "tableau" of words to
//		add together.  This is my term for the result of all of the
//		one bit multiplies (i_a[x] * i_b), and often written out
//		between two horizontal lines while doing long division.
//		This "tableau" is then added together to create the result.
//		Since so many lines need to be added together, we add two
//		lines at a time, and leave no line resting.  Therefore,
//		every even line gets added to an odd line at every clock
//		to create a new tableau that is roughly half the size of
//		the previous.  The operation then repeats and so forth.
//
//		I'm not sure I can explain the signed arithmetic extensions,
//		but that's what adds the extra 1's into the tableau and what
//		ends up complementing various bits along the way.
//
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Tecnology, LLC
//
///////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program.  (It's in the $(ROOT)/doc directory, run make with no
// target there if the PDF file isn't present.)  If not, see
// <http://www.gnu.org/licenses/> for a copy.
//
// License:	GPL, v3, as defined and found on www.gnu.org,
//		http://www.gnu.org/licenses/gpl.html
//
//
///////////////////////////////////////////////////////////////////////////
//
const char	cpyleft[] =
"///////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Copyright (C) 2015, Gisselquist Technology, LLC\n"
"//\n"
"// This program is free software (firmware): you can redistribute it and/or\n"
"// modify it under the terms of  the GNU General Public License as published\n"
"// by the Free Software Foundation, either version 3 of the License, or (at\n"
"// your option) any later version.\n"
"//\n"
"// This program is distributed in the hope that it will be useful, but WITHOUT\n"
"// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or\n"
"// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License\n"
"// for more details.\n"
"//\n"
"// You should have received a copy of the GNU General Public License along\n"
"// with this program.  If not, see <http://www.gnu.org/licenses/> for a\n"
"// copy.\n"
"//\n"
"// License:	GPL, v3, as defined and found on www.gnu.org,\n"
"//		http://www.gnu.org/licenses/gpl.html\n"
"//\n"
"//\n"
"///////////////////////////////////////////////////////////////////////////\n";
#include <stdio.h>
#include <stdlib.h>

const char	prjname[] = "A simple, better, multiply generator";
const char	creator[] =	"// Creator:	Dan Gisselquist, Ph.D.\n"
				"//		Gisselquist Tecnology, LLC\n";

void	buildmpy(FILE *fp, char *name, int na, int nb, bool aux) {
	int	row, clock, nbits, nrows, nzros, maxbits = na+nb;

	fprintf(fp,
"///////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Filename: 	%s.v\n"
"//		\n"
"// Project:	%s\n"
"//\n"
"// Purpose:\tThis verilog file multiplies two numbers together, without\n"
"// \t\tusing any hardware acceleration.  The file itself is\n"
"// \t\tcomputer generated, so please (for your sake) don\'t\n"
"// \t\tmake any edits to the file lest you regenerate it and\n"
"// \t\tthey be lost.\n"
"//\n"
"//\n%s"
"//\n",
		name, prjname, creator);

	fprintf(fp, "%s", cpyleft);

	if (nb < na) {
		fprintf(fp, "module %s(i_clk%s, i_ce, i_b, i_a%s, o_p%s);\n",
			name, (aux)?", i_rst":"",
			(aux)?", i_aux":"", (aux)?", o_aux":"");
		fprintf(fp,
			"\tparameter\tNA=%d, NB=%d;\n"
			"\tinput\t\t\t\t\ti_clk%s, i_ce;\n"
			"\tinput\t\tsigned\t[(NB-1):0]\ti_b;\n"
			"\tinput\t\tsigned\t[(NA-1):0]\ti_a;\n", nb, na,
			(aux)?", i_rst":"");
		int tmp = na; na = nb; nb = tmp;
	} else {
		fprintf(fp, "module %s(i_clk%s, i_ce, i_a, i_b%s, o_p%s);\n",
			name, (aux)?", i_rst":"",
			(aux)?", i_aux":"", (aux)?", o_aux":"");
		fprintf(fp,
			"\tparameter\tNA=%d, NB=%d;\n"
			"\tinput\t\t\t\t\ti_clk%s, i_ce;\n"
			"\tinput\t\tsigned\t[(NA-1):0]\ti_a;\n"
			"\tinput\t\tsigned\t[(NB-1):0]\ti_b;\n", na, nb,
			(aux)?", i_rst":"");
	} if (aux) fprintf(fp, "\tinput\t\t\t\t\ti_aux;\n");
	fprintf(fp, "\toutput\treg\tsigned\t[(NA+NB-1):0]\to_p;\n");
	if (aux) fprintf(fp, "\toutput\treg\t\t\t\to_aux;\n");
	fprintf(fp, "\n");

	// Build the first tableau row
	// There are Na elements, each of Nb length
	clock = 0;
	fprintf(fp, "\t// Clock zero: build our Tableau only.\n"
		"\t// There will be one row for every bit in i_a, and each\n"
		"\t// row will contain one more bit than i_b, to allow\n"
		"\t// for signed arithmetic manipulation.\n"
		"\t//\n"
		"\t// Perhaps calling this \'clock zero\' is a misnomer.  The term\n"
		"\t// is left over from an earlier version where this section was\n"
		"\t// truly clocked.  Now, we leave this as unclocked conditional\n"
		"\t// logic, since doing the first addition in logic fits within\n"
		"\t// LUTs without needing the additional pair of flip flops for \n"
		"\t// every row that would otherwise be needed.\n\t//\n");
	for(row=0; row<nb; row++)
		fprintf(fp, "\twire\t[NB:0]\tS_%d_%02d;\n", clock, row);
	if (aux) fprintf(fp, "\twire\tA_%d;\n", clock);
	fprintf(fp, "\n");
	fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b1, {(i_a[%d])?(~i_b[NB-1]):1\'b1},\n\t\t\t\t\t(i_a[%d])?i_b[NB-2:0]:{(NB-1){1\'b0}} };\n", clock, 0, 0, 0);
	for(row=1; row<na-1; row++)
		fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b0, ((i_a[%d])?(~i_b[(NB-1)]):1\'b1),\n\t\t\t\t\t((i_a[%d]) ? i_b[(NB-2):0] : {(NB-1){1\'b0}}) };\n", clock, row, row, row);
	for(row=na-1; row<nb-1; row++)
		fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b0, ((i_a[%d])?(~i_b[(NB-1)]):1\'b1),\n\t\t\t\t\t((i_a[%d]) ? i_b[(NB-2):0] : {(NB-1){1\'b0}}) };\n", clock, row, na-1, na-1);
	fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b1, ((i_a[%d])?i_b[NB-1]:1\'b0),\n\t\t\t\t\t((i_a[%d])?(~i_b[(NB-2):0]):{(NB-1){1\'b1}}) };\n", clock, row, na-1, na-1);
	if (aux)
		fprintf(fp, "\tassign\tA_%d = i_aux;\n", clock);
	nrows = nb; nbits = nb+1, nzros=1;

	while(nrows > 1) {
		fprintf(fp, "\n\t//\n\t// Round #%d, clock = %d, nz = %d, nbits = %d, nrows = %d\n",
			clock, clock, nzros, nbits, nrows);
		clock++; fprintf(fp, "\n");
		for(row=0; row<(nrows+1)/2; row++) {
			fprintf(fp, "\treg\t[(%d-1):0]\tS_%d_%02d;\n",
				((nbits+1+nzros)>maxbits)?maxbits
					:(nbits+1+nzros), clock, row);
		}
		if (aux) fprintf(fp, "\treg\tA_%d;\n", clock);

		fprintf(fp, "\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n");
		for(row=0; row<nrows/2; row++) {
			fprintf(fp, "\t\t\tS_%d_%02d <= { ", clock, row);
			if (maxbits-nbits>0) {
				if (nzros+nbits+1 > maxbits)
					fprintf(fp, "%d\'b0, ", maxbits-nbits);
				else
					fprintf(fp, "%d\'b0, ", nzros);
				fprintf(fp, "S_%d_%02d } + { S_%d_%02d",
					clock-1, 2*row, clock-1, 2*row+1);
			} else {
				fprintf(fp, "S_%d_%02d[%d:0] } + { S_%d_%02d",
					clock-1, 2*row,
					maxbits-1,
					clock-1, 2*row+1);
			}
			if (nzros+nbits+1 > maxbits)
				fprintf(fp, "[%d:0]", maxbits-1-nzros);
			fprintf(fp, ", %d\'b0 };\n", nzros);
		}
		if (nrows&1) {
			fprintf(fp, "\t\t\tS_%d_%02d <= { ",
				clock, row);
			if (nzros+nbits+1 > maxbits)
				fprintf(fp, "%d\'b0, ", maxbits-nbits);
			else
				fprintf(fp, "%d\'b0, ", nzros+1);
			fprintf(fp, "S_%d_%02d };\n", clock-1, nrows-1);
		}
		fprintf(fp, "\t\tend\n");
		if (aux)
		fprintf(fp, "\talways @(posedge i_clk)\n\t\tif (i_rst)\n"
		"\t\t\tA_%d <= 1'b0;\n"
		"\t\telse if (i_ce)\n"
		"\t\t\tA_%d <= A_%d;\n", clock, clock, clock-1);

		nrows = (nrows+1)/2;
		nbits+=1+nzros; nzros<<= 1;
	}

	// The full multiply is complete, just clock our outputs
	// to values we've already calculated.
	if (true) { // Register the results, requiring an extra clock
		fprintf(fp, "\n\talways @(posedge i_clk)\n\t\tif (i_ce)\n");
		fprintf(fp, "\t\t\to_p <= S_%d_00[(NA+NB-1):0];\n", clock);
		if (aux) {
			fprintf(fp, "\n"
			"\talways @(posedge i_clk)\n"
			"\t\tif (i_rst)\n"
			"\t\t\to_aux <= 1\'b0;\n"
			"\t\telse if (i_ce)\n"
			"\t\t\to_aux <= A_%d;\n", clock);
		}
	} else { // Just do a wire select to capture our results
		fprintf(fp, "\n\tassign\to_p = S_%d_00[(NA+NB-1):0];\n", clock);
		if (aux) fprintf(fp, "\tassign\to_aux = A_%d;\n", clock);
	}

	fprintf(fp, "\nendmodule\n");
}

void	buildmpy(int Na, int Nb, bool use_aux) {
	FILE	*fp;
	char	fname[256];

	sprintf(fname, "sgnmpy_%dx%d.v", Na, Nb);
	fp = fopen(fname, "w");
	if (!fp) {
		fprintf(stderr, "Could not open %s for writing\n", fname);
		perror("O/S Err:");
		exit(-1);
	}

	sprintf(fname, "sgnmpy_%dx%d", Na, Nb);
	buildmpy(fp, fname, Na, Nb, use_aux);

	fclose(fp);
}

int main(int argc, char **argv) {
	bool	use_aux = true;

	buildmpy(atoi(argv[1]), atoi(argv[2]), use_aux);

	return(0);
}
	/////////////////////////////////////////////////////////////////////////////
	//
	// Filename: mpygen.cpp
	//
	// Project: A simple, better, multiply generator
	//
	// Purpose: This is the product of studying the signed, two's complement,
	// multiply operation presented on wikipedia,
	//
	// http://en.wikipedia.org/wiki/Binary_multiplier
	//
	// Basically, this file supports the routine, bestmpy, which
	// builds a multiply for any pair of bitwidths. While I
	// didn't really want to build this in a coregen fashion,
	// such as you have here, the multiply operator kept changing
	// depending upon the bitwidth. I can handle such changes in
	// C++, so ... thus I have computer generated multiply code.
	// As a special benefit, the number of clock cycles that
	// this routine takes is given by ceiling of the base two
	// logarithm of the number of bits in the greater operand,
	// plus one more. Thus a 16x20 multiply requires 6 clocks,
	// whereas a 64x96 bit multiply (if you could build it), would
	// require 8 clocks. Not bad, huh? Oh, and the memory
	// requirements get smaller at each stage, as opposed to the
	// shiftadd multiply where the requirements were constant on
	// a per stage level, and often many more than I needed.
	//
	// The basic operation is to create a "tableau" of words to
	// add together. This is my term for the result of all of the
	// one bit multiplies (i_a[x] * i_b), and often written out
	// between two horizontal lines while doing long division.
	// This "tableau" is then added together to create the result.
	// Since so many lines need to be added together, we add two
	// lines at a time, and leave no line resting. Therefore,
	// every even line gets added to an odd line at every clock
	// to create a new tableau that is roughly half the size of
	// the previous. The operation then repeats and so forth.
	//
	// I'm not sure I can explain the signed arithmetic extensions,
	// but that's what adds the extra 1's into the tableau and what
	// ends up complementing various bits along the way.
	//
	// Creator: Dan Gisselquist, Ph.D.
	// Gisselquist Tecnology, LLC
	//
	///////////////////////////////////////////////////////////////////////////
	//
	// Copyright (C) 2015, Gisselquist Technology, LLC
	//
	// This program is free software (firmware): you can redistribute it and/or
	// modify it under the terms of the GNU General Public License as published
	// by the Free Software Foundation, either version 3 of the License, or (at
	// your option) any later version.
	//
	// This program is distributed in the hope that it will be useful, but WITHOUT
	// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or
	// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	// for more details.
	//
	// You should have received a copy of the GNU General Public License along
	// with this program. (It's in the $(ROOT)/doc directory, run make with no
	// target there if the PDF file isn't present.) If not, see
	// <http://www.gnu.org/licenses/> for a copy.
	//
	// License: GPL, v3, as defined and found on www.gnu.org,
	// http://www.gnu.org/licenses/gpl.html
	//
	//
	///////////////////////////////////////////////////////////////////////////
	//
	const char cpyleft[] =
	"///////////////////////////////////////////////////////////////////////////\n"
	"//\n"
	"// Copyright (C) 2015, Gisselquist Technology, LLC\n"
	"//\n"
	"// This program is free software (firmware): you can redistribute it and/or\n"
	"// modify it under the terms of the GNU General Public License as published\n"
	"// by the Free Software Foundation, either version 3 of the License, or (at\n"
	"// your option) any later version.\n"
	"//\n"
	"// This program is distributed in the hope that it will be useful, but WITHOUT\n"
	"// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or\n"
	"// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
	"// for more details.\n"
	"//\n"
	"// You should have received a copy of the GNU General Public License along\n"
	"// with this program. If not, see <http://www.gnu.org/licenses/> for a\n"
	"// copy.\n"
	"//\n"
	"// License: GPL, v3, as defined and found on www.gnu.org,\n"
	"// http://www.gnu.org/licenses/gpl.html\n"
	"//\n"
	"//\n"
	"///////////////////////////////////////////////////////////////////////////\n";
	#include <stdio.h>
	#include <stdlib.h>

	const char prjname[] = "A simple, better, multiply generator";
	const char creator[] = "// Creator: Dan Gisselquist, Ph.D.\n"
	"// Gisselquist Tecnology, LLC\n";

	void buildmpy(FILE fp, char name, int na, int nb, bool aux) {
	int row, clock, nbits, nrows, nzros, maxbits = na+nb;

	fprintf(fp,
	"///////////////////////////////////////////////////////////////////////////\n"
	"//\n"
	"// Filename: %s.v\n"
	"// \n"
	"// Project: %s\n"
	"//\n"
	"// Purpose:\tThis verilog file multiplies two numbers together, without\n"
	"// \t\tusing any hardware acceleration. The file itself is\n"
	"// \t\tcomputer generated, so please (for your sake) don\'t\n"
	"// \t\tmake any edits to the file lest you regenerate it and\n"
	"// \t\tthey be lost.\n"
	"//\n"
	"//\n%s"
	"//\n",
	name, prjname, creator);

	fprintf(fp, "%s", cpyleft);

	if (nb < na) {
	fprintf(fp, "module %s(i_clk%s, i_ce, i_b, i_a%s, o_p%s);\n",
	name, (aux)?", i_rst":"",
	(aux)?", i_aux":"", (aux)?", o_aux":"");
	fprintf(fp,
	"\tparameter\tNA=%d, NB=%d;\n"
	"\tinput\t\t\t\t\ti_clk%s, i_ce;\n"
	"\tinput\t\tsigned\t[(NB-1):0]\ti_b;\n"
	"\tinput\t\tsigned\t[(NA-1):0]\ti_a;\n", nb, na,
	(aux)?", i_rst":"");
	int tmp = na; na = nb; nb = tmp;
	} else {
	fprintf(fp, "module %s(i_clk%s, i_ce, i_a, i_b%s, o_p%s);\n",
	name, (aux)?", i_rst":"",
	(aux)?", i_aux":"", (aux)?", o_aux":"");
	fprintf(fp,
	"\tparameter\tNA=%d, NB=%d;\n"
	"\tinput\t\t\t\t\ti_clk%s, i_ce;\n"
	"\tinput\t\tsigned\t[(NA-1):0]\ti_a;\n"
	"\tinput\t\tsigned\t[(NB-1):0]\ti_b;\n", na, nb,
	(aux)?", i_rst":"");
	} if (aux) fprintf(fp, "\tinput\t\t\t\t\ti_aux;\n");
	fprintf(fp, "\toutput\treg\tsigned\t[(NA+NB-1):0]\to_p;\n");
	if (aux) fprintf(fp, "\toutput\treg\t\t\t\to_aux;\n");
	fprintf(fp, "\n");

	// Build the first tableau row
	// There are Na elements, each of Nb length
	clock = 0;
	fprintf(fp, "\t// Clock zero: build our Tableau only.\n"
	"\t// There will be one row for every bit in i_a, and each\n"
	"\t// row will contain one more bit than i_b, to allow\n"
	"\t// for signed arithmetic manipulation.\n"
	"\t//\n"
	"\t// Perhaps calling this \'clock zero\' is a misnomer. The term\n"
	"\t// is left over from an earlier version where this section was\n"
	"\t// truly clocked. Now, we leave this as unclocked conditional\n"
	"\t// logic, since doing the first addition in logic fits within\n"
	"\t// LUTs without needing the additional pair of flip flops for \n"
	"\t// every row that would otherwise be needed.\n\t//\n");
	for(row=0; row<nb; row++)
	fprintf(fp, "\twire\t[NB:0]\tS_%d_%02d;\n", clock, row);
	if (aux) fprintf(fp, "\twire\tA_%d;\n", clock);
	fprintf(fp, "\n");
	fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b1, {(i_a[%d])?(~i_b[NB-1]):1\'b1},\n\t\t\t\t\t(i_a[%d])?i_b[NB-2:0]:{(NB-1){1\'b0}} };\n", clock, 0, 0, 0);
	for(row=1; row<na-1; row++)
	fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b0, ((i_a[%d])?(~i_b[(NB-1)]):1\'b1),\n\t\t\t\t\t((i_a[%d]) ? i_b[(NB-2):0] : {(NB-1){1\'b0}}) };\n", clock, row, row, row);
	for(row=na-1; row<nb-1; row++)
	fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b0, ((i_a[%d])?(~i_b[(NB-1)]):1\'b1),\n\t\t\t\t\t((i_a[%d]) ? i_b[(NB-2):0] : {(NB-1){1\'b0}}) };\n", clock, row, na-1, na-1);
	fprintf(fp, "\tassign\tS_%d_%02d = { 1\'b1, ((i_a[%d])?i_b[NB-1]:1\'b0),\n\t\t\t\t\t((i_a[%d])?(~i_b[(NB-2):0]):{(NB-1){1\'b1}}) };\n", clock, row, na-1, na-1);
	if (aux)
	fprintf(fp, "\tassign\tA_%d = i_aux;\n", clock);
	nrows = nb; nbits = nb+1, nzros=1;

	while(nrows > 1) {
	fprintf(fp, "\n\t//\n\t// Round #%d, clock = %d, nz = %d, nbits = %d, nrows = %d\n",
	clock, clock, nzros, nbits, nrows);
	clock++; fprintf(fp, "\n");
	for(row=0; row<(nrows+1)/2; row++) {
	fprintf(fp, "\treg\t[(%d-1):0]\tS_%d_%02d;\n",
	((nbits+1+nzros)>maxbits)?maxbits
	:(nbits+1+nzros), clock, row);
	}
	if (aux) fprintf(fp, "\treg\tA_%d;\n", clock);

	fprintf(fp, "\n"
	"\talways @(posedge i_clk)\n"
	"\t\tif (i_ce)\n"
	"\t\tbegin\n");
	for(row=0; row<nrows/2; row++) {
	fprintf(fp, "\t\t\tS_%d_%02d <= { ", clock, row);
	if (maxbits-nbits>0) {
	if (nzros+nbits+1 > maxbits)
	fprintf(fp, "%d\'b0, ", maxbits-nbits);
	else
	fprintf(fp, "%d\'b0, ", nzros);
	fprintf(fp, "S_%d_%02d } + { S_%d_%02d",
	clock-1, 2row, clock-1, 2row+1);
	} else {
	fprintf(fp, "S_%d_%02d[%d:0] } + { S_%d_%02d",
	clock-1, 2*row,
	maxbits-1,
	clock-1, 2*row+1);
	}
	if (nzros+nbits+1 > maxbits)
	fprintf(fp, "[%d:0]", maxbits-1-nzros);
	fprintf(fp, ", %d\'b0 };\n", nzros);
	}
	if (nrows&1) {
	fprintf(fp, "\t\t\tS_%d_%02d <= { ",
	clock, row);
	if (nzros+nbits+1 > maxbits)
	fprintf(fp, "%d\'b0, ", maxbits-nbits);
	else
	fprintf(fp, "%d\'b0, ", nzros+1);
	fprintf(fp, "S_%d_%02d };\n", clock-1, nrows-1);
	}
	fprintf(fp, "\t\tend\n");
	if (aux)
	fprintf(fp, "\talways @(posedge i_clk)\n\t\tif (i_rst)\n"
	"\t\t\tA_%d <= 1'b0;\n"
	"\t\telse if (i_ce)\n"
	"\t\t\tA_%d <= A_%d;\n", clock, clock, clock-1);

	nrows = (nrows+1)/2;
	nbits+=1+nzros; nzros<<= 1;
	}

	// The full multiply is complete, just clock our outputs
	// to values we've already calculated.
	if (true) { // Register the results, requiring an extra clock
	fprintf(fp, "\n\talways @(posedge i_clk)\n\t\tif (i_ce)\n");
	fprintf(fp, "\t\t\to_p <= S_%d_00[(NA+NB-1):0];\n", clock);
	if (aux) {
	fprintf(fp, "\n"
	"\talways @(posedge i_clk)\n"
	"\t\tif (i_rst)\n"
	"\t\t\to_aux <= 1\'b0;\n"
	"\t\telse if (i_ce)\n"
	"\t\t\to_aux <= A_%d;\n", clock);
	}
	} else { // Just do a wire select to capture our results
	fprintf(fp, "\n\tassign\to_p = S_%d_00[(NA+NB-1):0];\n", clock);
	if (aux) fprintf(fp, "\tassign\to_aux = A_%d;\n", clock);
	}

	fprintf(fp, "\nendmodule\n");
	}

	void buildmpy(int Na, int Nb, bool use_aux) {
	FILE *fp;
	char fname[256];

	sprintf(fname, "sgnmpy_%dx%d.v", Na, Nb);
	fp = fopen(fname, "w");
	if (!fp) {
	fprintf(stderr, "Could not open %s for writing\n", fname);
	perror("O/S Err:");
	exit(-1);
	}

	sprintf(fname, "sgnmpy_%dx%d", Na, Nb);
	buildmpy(fp, fname, Na, Nb, use_aux);

	fclose(fp);
	}

	int main(int argc, char **argv) {
	bool use_aux = true;

	buildmpy(atoi(argv[1]), atoi(argv[2]), use_aux);

	return(0);
	}