tueda/formset.py

## formset.py
#!/bin/sh
""":" .

exec python "$0" "$@"
"""

from __future__ import print_function

import argparse
import contextlib
import copy
import math
import os
import re
import subprocess
import sys

__doc__ = """\
Generate form.set suited for the local machine.

Example
-------
$ formset.py -o
$ tform `formset.py -f` calcdia.frm
$ minos `formset.py -m` minos.file

Python versions
---------------
2.7, 3.2, 3.3, 3.4, 3.5
"""


if not hasattr(subprocess, 'check_output'):
    # Backport for old systems where Python 2.6 + argparse are available:
    # `subprocess.check_output` only exists from Python 2.7 on.
    def check_output(*popenargs, **kwargs):
        """Run a command and return what it wrote to stdout.

        Arguments are forwarded to `subprocess.Popen`, except that `stdout`
        must not be given because it is always replaced by a pipe.  A
        non-zero exit status raises `subprocess.CalledProcessError`.
        """
        if 'stdout' in kwargs:  # pragma: no cover
            raise ValueError('stdout argument not allowed, '
                             'it will be overridden.')
        kwargs['stdout'] = subprocess.PIPE
        child = subprocess.Popen(*popenargs, **kwargs)
        captured = child.communicate()[0]
        status = child.poll()
        if not status:
            return captured
        # Report the command either from the `args` keyword or from the
        # first positional argument.
        failed = kwargs.get('args')
        if failed is None:
            failed = popenargs[0]
        # `output` keyword is not available in 2.6.
        raise subprocess.CalledProcessError(status, failed)
    subprocess.check_output = check_output


@contextlib.contextmanager
def open_w_or_stdout(filename=None):
    """Context manager yielding a writable file for `filename`, or stdout.

    When `filename` is empty or None, `sys.stdout` is yielded and nothing
    else happens.  Otherwise the data is written to a temporary file next to
    `filename` (named `<filename>.tmp<pid>`), which is flushed, synced and
    then renamed onto `filename` once the `with` body has finished.

    If the body raises, the temporary file is removed and `filename` is left
    untouched, so a failed run never replaces an existing file with a
    partially written one.  The exception is re-raised.
    """
    if not filename:
        yield sys.stdout
        return

    # See https://stackoverflow.com/a/2333979.
    tmpfilename = '{0}.tmp{1}'.format(filename, os.getpid())
    f = open(tmpfilename, 'w')
    try:
        yield f
        f.flush()
        os.fsync(f.fileno())
    except BaseException:
        # Writing failed (or was interrupted): discard the partial output
        # instead of renaming it over the target.
        f.close()
        try:
            os.remove(tmpfilename)
        except OSError:
            pass
        raise
    f.close()
    os.rename(tmpfilename, filename)


def round_down(x, n):
    """Return `x` rounded down to a multiple of `n` (floor division)."""
    multiples = x // n
    return multiples * n


def round_up(x, n):
    """Return `x` rounded up to a multiple of `n`."""
    # Adding n - 1 first turns the floor division into a ceiling.
    bumped = x + (n - 1)
    return bumped // n * n


def metric_prefix(s):
    """Translate a metric prefix letter into its multiplier.

    The lookup is case-insensitive.  The empty string maps to 1 and
    'k', 'm', 'g', 't' map to successive powers of 1000.  Anything else
    yields None.
    """
    multipliers = {
        '': 1,
        'k': 1000,
        'm': 1000**2,
        'g': 1000**3,
        't': 1000**4,
    }
    return multipliers.get(s.lower())


def parse_number(s):
    """Parse a string as an integer with an optional metric suffix.

    A trailing K, M, G or T (any case) scales the value by the matching
    power of 1000 as given by `metric_prefix`.  The numeric part may be an
    integer or anything `float` accepts (e.g. '1.5' or '2e3'); the scaled
    result is truncated to an int.

    Raises ValueError when the string is not a finite number.  Values that
    overflow to infinity are reported as ValueError too, so that callers
    only have to handle one exception type.
    """
    scale = 1
    m = re.match(r'(.*)([kmgtKMGT])$', s)
    if m:
        s = m.group(1)
        scale = metric_prefix(m.group(2))
    try:
        # Plain integers are scaled exactly, without a round trip through
        # float that would lose precision above 2**53.
        return int(s) * scale
    except ValueError:
        pass
    try:
        # May raise ValueError.
        return int(float(s) * scale)
    except OverflowError:
        # int(float('inf')) raises OverflowError rather than ValueError.
        raise ValueError('number out of range: {0}'.format(s))


def round_human_readable(x, up=False, tostring=True):
    """Round off `x` so that it reads well with a metric suffix.

    `x` is rounded (up if `up` is true, otherwise down) to three significant
    figures.  If the rounded value is a whole multiple of 1000**4, 1000**3,
    1000**2 or 1000 (tried in that order), it is returned as a string such
    as '12G' when `tostring` is true, or as the rounded number otherwise.
    If no suffix fits, the rounded number itself is returned.

    Values below 1 (including zero and negative numbers) have no logarithm
    to work with and are returned unchanged.  Values below 1000 already have
    at most three digits and are therefore not altered by the rounding.
    """
    if x < 1:
        return x
    round_off = round_up if up else round_down
    # Take 3 significant figures.  The exponent is clamped at zero so that
    # the rounding step is never a fraction, which would turn small integers
    # into inexact floats.
    n = 10**max(int(math.floor(math.log10(x))) - 2, 0)
    x = round_off(x, n)
    # Find a good suffix which doesn't change the value.
    for suffix, unit in (('T', 1000**4), ('G', 1000**3),
                         ('M', 1000**2), ('K', 1000)):
        if round_off(x, unit) == x:
            return '{0}{1}'.format(x // unit, suffix) if tostring else x
    return x


class classproperty(property):  # noqa
    """Read-only property evaluated on the class instead of an instance."""

    def __get__(self, cls, owner):
        """Call the wrapped getter with the owning class and return its result."""
        return self.fget(owner)


class SystemInfo(object):
    """System information obtained from the `lscpu` and `free` commands.

    Each command is run at most once; its parsed output is cached on the
    class.  Set `verbose` to True to get a note on stderr before a command
    is run.
    """

    # Parsed `lscpu` output: {field name: value string}, or None if not run.
    _cpu_info = None
    # Parsed `free -b` output: {row name: list of column strings}, or None.
    _mem_info = None

    verbose = False

    @classproperty
    def number_of_nodes(cls):  # noqa
        """Return the number of nodes (1 if `lscpu` reports no NUMA nodes)."""
        info = cls._get_cpu_info()
        if 'NUMA node(s)' in info:
            return int(info['NUMA node(s)'])
        else:
            return 1

    @classproperty
    def number_of_cpus(cls):  # noqa
        """Return the number of cpus (the `CPU(s)` field of `lscpu`)."""
        info = cls._get_cpu_info()
        return int(info['CPU(s)'])

    @classproperty
    def number_of_physical_cores(cls):  # noqa
        """Return the number of physical cores (sockets x cores per socket)."""
        info = cls._get_cpu_info()
        return int(info['Socket(s)']) * int(info['Core(s) per socket'])

    @classproperty
    def total_memory(cls):  # noqa
        """Return the total physical memory in bytes.

        This is the first column of the `Mem` row printed by `free -b`.
        """
        info = cls._get_mem_info()
        return int(info['Mem'][0])

    @staticmethod
    def _parse_fields(text):
        """Split `key: value` lines of `text` into stripped (key, value) pairs.

        Only the first colon on a line separates key and value, so values
        that contain colons themselves are kept intact.  Lines without any
        colon are skipped.
        """
        pairs = []
        for line in text.strip().split('\n'):
            key, sep, value = line.partition(':')
            if sep:
                pairs.append((key.strip(), value.strip()))
        return pairs

    @classmethod
    def _get_cpu_info(cls):
        """Return the cached `lscpu` fields, running `lscpu` on first use."""
        if cls._cpu_info is None:
            if cls.verbose:
                sys.stderr.write('running lscpu...\n')
            output = subprocess.check_output(['lscpu']).decode('utf-8')
            cls._cpu_info = dict(cls._parse_fields(output))
        return cls._cpu_info

    @classmethod
    def _get_mem_info(cls):
        """Return the cached `free -b` rows, running `free` on first use."""
        if cls._mem_info is None:
            if cls.verbose:
                sys.stderr.write('running free...\n')
            output = subprocess.check_output(['free', '-b']).decode('utf-8')
            # Each row maps to the list of its whitespace-separated columns.
            cls._mem_info = dict((name, columns.split())
                                 for name, columns
                                 in cls._parse_fields(output))
        return cls._mem_info


class Setup(object):
    """Setup parameters.

    Every public attribute (name not starting with an underscore) is one
    setup parameter.  Attributes starting with an underscore are size
    constants used by `calc` and are not reported by `items`.
    """

    def __init__(self):
        """Construct a set of setup parameters."""
        # Parameters that enter the memory estimate in `calc`.
        self.compresssize = 90000
        self.filepatches = 256
        self.hidesize = 0
        self.largepatches = 256
        self.largesize = 50000000
        self.maxtermsize = 40000  # 64-bit
        self.numstorecaches = 4
        self.scratchsize = 50000000
        self.sizestorecache = 32768
        self.smallextension = 20000000
        self.smallsize = 10000000
        self.sortiosize = 100000
        self.termsinsmall = 100000
        self.threadbucketsize = 500
        self.threads = -1  # form
        self.threadscratchoutsize = 2500000
        self.threadscratchsize = 100000
        self.workspace = 40000000  # 64-bit

        # Parameters that `calc` does not read.
        self.bracketindexsize = 200000
        self.constindex = 128
        self.continuationlines = 15
        self.functionlevels = 30
        self.maxnumbersize = 200
        self.maxwildcards = 100
        self.parentheses = 100
        self.processbucketsize = 1000
        self.subfilepatches = 64
        self.sublargepatches = 64
        self.sublargesize = 4000000
        self.subsmallextension = 800000
        self.subsmallsize = 500000
        self.subsortiosize = 32768
        self.subtermsinsmall = 10000

        # 64-bit
        self._ptrsize = 8
        self._possize = 8
        self._wordsize = 4

    def items(self):
        """Return pairs of parameters and values.

        The result is a tuple of (name, value) pairs sorted by name; names
        starting with an underscore are left out.
        """
        items = [(k, v) for (k, v) in self.__dict__.items() if k[0] != '_']
        items.sort()
        return tuple(items)

    def __str__(self):
        """Return the string representation.

        Note that this calls `calc`, which may adjust parameters in place.
        """
        mem = self.calc()
        params = ['{0}: {1}'.format(k, v) for (k, v) in self.items()]
        return '<Setup: {0} bytes, {1}>'.format(mem, ', '.join(params))

    def copy(self):
        """Return a shallow copy."""
        return copy.copy(self)

    def calc(self):
        """Return an estimation of memory usage.

        Before summing up, the parameters are adjusted IN PLACE so that they
        satisfy a series of minimum-size constraints; the statements depend
        on each other and must stay in this order.  The returned value is
        the sum computed from the adjusted parameters.
        """
        self.maxtermsize = max(self.maxtermsize, 200)

        self.compresssize = max(self.compresssize,
                                2 * self.maxtermsize * self._wordsize)
        self.sortiosize = max(self.sortiosize,
                              self.maxtermsize * self._wordsize)

        # The strange factor WordSize**2 is used in the FORM source...
        self.scratchsize = max(self.scratchsize,
                               4 * self.maxtermsize * self._wordsize**2)
        # hidesize is only raised to the minimum when it is set (> 0).
        if self.hidesize > 0:
            self.hidesize = max(self.hidesize,
                                4 * self.maxtermsize * self._wordsize**2)

        self.threadscratchsize = max(self.threadscratchsize,
                                     4 * self.maxtermsize * self._wordsize**2)
        self.threadscratchoutsize = max(self.threadscratchoutsize,
                                        4 * self.maxtermsize *
                                        self._wordsize**2)

        # constraints in RecalcSetups()

        self.filepatches = max(self.filepatches, self.threads)

        self.termsinsmall = round_up(self.termsinsmall, 16)

        numberofblocksinsort = 10
        minimumnumberofterms = 10
        n = numberofblocksinsort * minimumnumberofterms
        if self.threads >= 0:
            minbufsize = (self.threads * (1 + n) * self.maxtermsize *
                          self._wordsize)
            if self.largesize + self.smallextension < minbufsize:
                self.largesize = minbufsize - self.smallextension

        # constraints in AllocSort()

        self.filepatches = max(self.filepatches, 4)

        self.smallsize = max(self.smallsize,
                             16 * self.maxtermsize * self._wordsize)

        self.smallextension = max(self.smallextension, self.smallsize * 3 // 2)

        if self.largesize > 0:
            self.largesize = max(self.largesize, 2 * self.smallsize)

        compinc = 2
        minbufsize = self.filepatches * (self.sortiosize +
                                         (compinc + 2 * self.maxtermsize) *
                                         self._wordsize)
        if self.largesize + self.smallextension < minbufsize:
            # Grow the small extension when there is no large buffer,
            # otherwise grow the large buffer.
            if self.largesize == 0:
                self.smallextension = minbufsize
            else:
                self.largesize = minbufsize - self.smallextension

        iotry = (((self.largesize + self.smallextension) // self.filepatches //
                 self._wordsize) - 2 * self.maxtermsize - compinc)  # in words
        self.sortiosize = max(self.sortiosize, iotry)  # bytes vs. words??

        # Compute the memory usage.

        mem = 0
        # Two scratch buffers plus either the hide buffer (when set) or a
        # third scratch-sized one.
        mem += (self.scratchsize * 2 + (self.hidesize
                                        if self.hidesize > 0
                                        else self.scratchsize))
        mem += self.workspace * self._wordsize
        mem += (self.compresssize + 10) * self._wordsize
        mem += (self.largesize + self.smallextension + 3 * self.termsinsmall *
                self._ptrsize + self.sortiosize)

        storecachesize = self._possize * 2 * self._ptrsize + self._wordsize
        # ignore the padding
        storecachesize += self.sizestorecache
        mem += storecachesize * self.numstorecaches

        # The remaining contributions are counted once per thread.
        if self.threads >= 1:
            mem += ((self.threadscratchoutsize + self.threadscratchsize * 2) *
                    self.threads)
            mem += self.workspace * self._wordsize * self.threads
            mem += (self.compresssize + 10) * self._wordsize * self.threads

            # Sort buffers of one thread: the buffer sizes and filepatches
            # are divided by the number of threads before the constraints
            # are applied again.
            mem += self._thread_alloc_sort(self.largesize // self.threads,
                                           self.smallsize // self.threads,
                                           self.smallextension // self.threads,
                                           self.termsinsmall,
                                           self.largepatches,
                                           self.filepatches // self.threads,
                                           self.sortiosize) * self.threads

            mem += storecachesize * self.numstorecaches * self.threads

            sizethreadbuckets = ((self.threadbucketsize + 1) *
                                 self.maxtermsize + 2) * self._wordsize
            # The bucket size is reduced by a factor that depends on
            # threadbucketsize.
            if self.threadbucketsize >= 250:
                sizethreadbuckets //= 4
            elif self.threadbucketsize >= 90:
                sizethreadbuckets //= 3
            elif self.threadbucketsize >= 40:
                sizethreadbuckets //= 2
            sizethreadbuckets //= self._wordsize
            mem += ((2 * sizethreadbuckets * self._wordsize +
                    (self.threadbucketsize + 1) * self._possize) *
                    2 * self.threads)
            # An extra term for every thread beyond the second.
            if self.threads >= 3:
                mem += ((self.workspace * self._wordsize // 8 +
                        2 * self.maxtermsize * self._wordsize) *
                        (self.threads - 2))

        return mem

    def _thread_alloc_sort(self, largesize, smallsize, smallextension,
                           termsinsmall, largepatches, filepatches,
                           sortiosize):
        """Return the sort-buffer memory estimate for the given sizes.

        The arguments are adjusted locally with the same constraints as in
        the "constraints in AllocSort()" part of `calc`; `self` is only read
        (`maxtermsize`, `_wordsize`, `_ptrsize`) and never modified.
        `largepatches` is accepted but not used in the computation.
        """
        filepatches = max(filepatches, 4)

        smallsize = max(smallsize, 16 * self.maxtermsize * self._wordsize)

        smallextension = max(smallextension, smallsize * 3 // 2)

        if largesize > 0:
            largesize = max(largesize, 2 * smallsize)

        compinc = 2
        minbufsize = filepatches * (sortiosize + (compinc +
                                    2 * self.maxtermsize) * self._wordsize)
        if largesize + smallextension < minbufsize:
            if largesize == 0:
                smallextension = minbufsize
            else:
                largesize = minbufsize - smallextension

        iotry = (((largesize + smallextension) // filepatches //
                 self._wordsize) - 2 * self.maxtermsize - compinc)  # in words
        sortiosize = max(sortiosize, iotry)  # bytes vs. words??

        return (largesize + smallextension + 3 * termsinsmall * self._ptrsize +
                sortiosize)


def main():
    """Entry point.

    Steps:

    1. Parse the command line.  `--help` is answered immediately, before any
       system information is queried, so it works even where `lscpu` or
       `free` are not available.
    2. Determine the total number of cpus and the total memory, either from
       the options or from `SystemInfo`, and the number of cpus to use.
    3. Apply the `par=val`, `par+=int` and `par*=float` arguments to a
       `Setup`.  Unknown parameters given as `par=val` are passed through
       to the output unchanged.
    4. For `--form` / `--minos`, print the corresponding option and exit.
    5. Search for a scale factor applied to smallsize, largesize,
       termsinsmall and scratchsize such that the estimated memory usage
       stays just below the allowed amount.
    6. For `--usage`, print the estimate and exit; otherwise write the
       parameters that differ from the defaults to the output.
    """
    # Parse the command line arguments.
    parser = argparse.ArgumentParser(
        usage=('%(prog)s [options] [--] '
               '[par=val].. [par+=int].. [par*=float]..'),
        epilog=('On non-Linux systems, the number of physical CPUs and memory '
                'available on the machine may be not automatically detected. '
                'In such a case, one cannot use the default parameters '
                'depending on those values and needs to explicitly specify '
                '--ncpus, --total-cpus and --total-memory.'),
        add_help=False
    )
    parser.add_argument('-h',
                        '--help',
                        action='store_const',
                        const=True,
                        help='show this help message and exit')
    parser.add_argument('-o',
                        '--output',
                        action='store',
                        nargs='?',
                        const='form.set',
                        help=('output to FILE (default: no (stdout), '
                              'FILE=form.set)'),
                        metavar='FILE')
    parser.add_argument('-f',
                        '--form',
                        action='store_const',
                        const=True,
                        help='print tform options (e.g., -w4) and exit')
    parser.add_argument('-m',
                        '--minos',
                        action='store_const',
                        const=True,
                        help='print minos options (e.g., -m2x4) and exit')
    parser.add_argument('-u',
                        '--usage',
                        action='store_const',
                        const=True,
                        help='print expected initial memory usage and exit')
    parser.add_argument('-H',
                        '--human-readable',
                        action='store_const',
                        const=True,
                        help=('adjust to human-readable numbers '
                              '(e.g., 1K, 23M, 456G)'))
    parser.add_argument('-1',
                        '--one',
                        action='store_const',
                        const=-1,
                        dest='ncpus',
                        help='use cpus in a node on the machine (default)')
    parser.add_argument('--full',
                        action='store_const',
                        const=-99999,
                        dest='ncpus',
                        help='use cpus in all nodes on the machine')
    parser.add_argument('-n',
                        '--ncpus',
                        action='store',
                        type=int,
                        help='use N cpus',
                        metavar='N')
    parser.add_argument('-p',
                        '--percentage',
                        action='store',
                        default=75.0,
                        type=float,
                        help=('percentage of initial memory usage '
                              '(default: 75.0)'),
                        metavar='N')
    parser.add_argument('--total-cpus',
                        action='store',
                        type=int,
                        help='specify the total cpus on the machine',
                        metavar='N')
    parser.add_argument('--total-memory',
                        action='store',
                        help='specify the total memory on the machine',
                        metavar='N')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_const',
                        const=True,
                        help='verbose output')
    parser.add_argument('args',
                        nargs='*',
                        help=argparse.SUPPRESS)
    args = parser.parse_args()
    pars = {}

    # Help message.  Handled first: it needs no system information, and
    # querying it (lscpu/free) may fail on systems lacking those commands.
    if args.help:
        parser.print_help()
        sys.exit(0)

    # NOTE: when all of `--ncpus`, `--total-cpus` and `--total-memory` are
    # specified, we don't need to access the system information.

    if args.verbose:
        SystemInfo.verbose = True

    if args.total_cpus:
        total_cpus = args.total_cpus
    else:
        total_cpus = SystemInfo.number_of_physical_cores

    if args.total_memory:
        try:
            total_memory = parse_number(args.total_memory)
        except ValueError:
            parser.error('non-integer value for total memory: {0}'.format(
                args.total_memory))
    else:
        total_memory = SystemInfo.total_memory

    # Number of CPUs.
    if args.ncpus is not None:
        ncpus = args.ncpus
    else:
        # Use 1 node for each job by default.
        ncpus = -1
    if ncpus < 0:
        # Use (-ncpus) nodes.
        ncpus = -ncpus * (total_cpus // SystemInfo.number_of_nodes)
    ncpus = max(ncpus, 1)
    ncpus = min(ncpus, total_cpus)

    sp = Setup()
    sp.threads = ncpus if ncpus >= 2 else -1

    for a in args.args:
        m = re.match(r'([a-zA-Z][a-zA-Z0-9]*)([+*]?)=(.*)', a)
        if m:
            par = m.group(1).lower()
            ope = m.group(2)
            val = m.group(3)
            if par in sp.__dict__:
                # Known parameter.
                if ope == '' or ope == '+':
                    # We have par=val or par+=int.
                    try:
                        val = parse_number(val)
                    except ValueError:
                        parser.error(
                            'non-integer value for parameter: {0}'.format(a))
                    if ope == '':
                        setattr(sp, par, val)
                    else:
                        setattr(sp, par, getattr(sp, par) + val)
                    continue
                else:
                    # We have par*=float.
                    try:
                        val = float(val)
                    except ValueError:
                        parser.error(
                            'non-float value for parameter: {0}'.format(a))
                    setattr(sp, par, int(getattr(sp, par) * val))
                    continue
            elif ope == '':
                # Unknown parameter given by par=val. Add it to the dictionary.
                pars[par] = val
                continue
        parser.error('unrecognized argument: {0}'.format(a))

    # Our resource.
    cpus = max(sp.threads, 1)
    memory = int(total_memory * args.percentage / 100.0 * cpus / total_cpus)

    # For --form option.
    if args.form:
        print('-w{0}'.format(cpus))
        sys.exit()

    # For --minos option.
    if args.minos:
        print('-m{0}x{1}'.format(total_cpus // cpus, cpus))
        sys.exit()

    # Presumably increasing MaxTermSize requires increasing WorkSpace, too.
    sp.workspace = max(sp.workspace, sp.maxtermsize * 250)

    # Optimize the memory usage by bisection.
    max_iteration = 50

    sp0 = sp.copy()

    def f(x):
        """Return (estimated usage - allowed memory, scaled setup) for `x`."""
        # Hopefully monotonically increasing.
        sp = sp0.copy()
        sp.smallsize = int(sp.smallsize * x)
        sp.largesize = int(sp.largesize * x)
        sp.termsinsmall = int(sp.termsinsmall * x)
        sp.scratchsize = int(sp.scratchsize * x)
        m = sp.calc()
        if args.human_readable:
            m = round_human_readable(m, True, False)
        return (- (memory - m), sp)

    # Invariant once bracketed: f(x1) < 0 <= f(x2), i.e. x1 fits into the
    # allowed memory and x2 does not leave any headroom.
    x1 = 1.0
    x2 = None
    y1 = f(x1)[0]
    y2 = None
    for _i in range(max_iteration):
        if x2 is None:
            # Not bracketed yet: keep doubling (if x1 fits) or halving
            # (if it does not) until the sign changes.
            if y1 < 0:
                x = x1 * 2.0
                y = f(x)[0]
                if y > 0:
                    x2 = x
                    y2 = y
                else:
                    x1 = x
                    y1 = y
            else:
                x = x1 * 0.5
                y = f(x)[0]
                if y < 0:
                    x2 = x1
                    y2 = y1
                    x1 = x
                    y1 = y
                else:
                    x1 = x
                    y1 = y
        else:
            # Bracketed: plain bisection.
            x = (x1 + x2) * 0.5
            y = f(x)[0]
            if y < 0:
                x1 = x
                y1 = y
            else:
                x2 = x
                y2 = y
        if x2 is not None:
            assert x1 < x2 and y1 < y2

    if x2 is None:
        if x1 < 1.0e-12:
            x1 = 0
        # ArgumentParser.exit() takes (status, message): pass a failure
        # status explicitly instead of the message as the status.
        parser.exit(1, ('failed to find parameters: memory({0}) = {1} '
                        'bytes shortage\n').format(x1, y1))

    # For --usage option.
    if args.usage:
        m = f(x1)[1].calc()
        if args.human_readable:
            m = round_human_readable(m, True)
        print(m)
        sys.exit()

    # Output.
    with open_w_or_stdout(args.output) as fi:
        def round_memory(m):
            return (round_human_readable(m, False)
                    if args.human_readable else m)

        # Header comment line recording the command line and the resources.
        print(('# {0}{1} (cpu: {2}, mem: {3}; '
               'total cpu: {4}, total mem: {5}; {6}x{7})').format(
            parser.prog,
            (' ' if len(sys.argv) >= 2 else '') + ' '.join(sys.argv[1:]),
            cpus,
            round_memory(memory),
            total_cpus,
            round_memory(total_memory),
            total_cpus // cpus,
            cpus,
        ), file=fi)

        sp = f(x1)[1]
        sp0 = Setup()  # default value
        dic0 = dict(sp0.items())
        for k, v in sp.items():
            if k == 'threads':
                # 'threads N' doesn't work, must be given by tform option -wN.
                continue
            if v == dic0[k]:
                # Don't write when same as the default value.
                continue
            if args.human_readable:
                v = round_human_readable(v, False)
            print('{0} {1}'.format(k, v), file=fi)
        for k, v in pars.items():
            print('{0} {1}'.format(k, v), file=fi)


if __name__ == '__main__':
    main()
	#!/bin/sh
	""":" .

	exec python "$0" "$@"
	"""

	from __future__ import print_function

	import argparse
	import contextlib
	import copy
	import math
	import os
	import re
	import subprocess
	import sys

	__doc__ = """\
	Generate form.set suited for the local machine.

	Example
	-------
	$ formset.py -o
	$ tform `formset.py -f` calcdia.frm
	$ minos `formset.py -m` minos.file

	Python versions
	---------------
	2.7, 3.2, 3.3, 3.4, 3.5
	"""


	if 'check_output' not in dir(subprocess):
	    # For old systems where Python 2.6 + argparse available.
	    def check_output(*popenargs, **kwargs):
	        """Run a command and return what it wrote to stdout.

	        Arguments are forwarded to `subprocess.Popen`, except that
	        `stdout` must not be given because it is always replaced by a
	        pipe.  A non-zero exit status raises
	        `subprocess.CalledProcessError`.
	        """
	        if 'stdout' in kwargs:  # pragma: no cover
	            raise ValueError('stdout argument not allowed, '
	                             'it will be overridden.')
	        process = subprocess.Popen(stdout=subprocess.PIPE,
	                                   *popenargs, **kwargs)
	        output, _ = process.communicate()
	        retcode = process.poll()
	        if retcode:
	            cmd = kwargs.get('args')
	            if cmd is None:
	                cmd = popenargs[0]
	            # `output` keyword is not available in 2.6.
	            raise subprocess.CalledProcessError(retcode, cmd)
	        return output
	    subprocess.check_output = check_output


	@contextlib.contextmanager
	def open_w_or_stdout(filename=None):
	    """Context manager yielding a writable file for `filename`, or stdout.

	    When `filename` is empty or None, `sys.stdout` is yielded.  Otherwise
	    the data goes to a temporary file (`<filename>.tmp<pid>`), which is
	    flushed, synced and renamed onto `filename` after the `with` body.

	    If the body raises, the temporary file is removed, `filename` is left
	    untouched and the exception is re-raised.
	    """
	    if not filename:
	        yield sys.stdout
	        return

	    # See https://stackoverflow.com/a/2333979.
	    tmpfilename = '{0}.tmp{1}'.format(filename, os.getpid())
	    f = open(tmpfilename, 'w')
	    try:
	        yield f
	        f.flush()
	        os.fsync(f.fileno())
	    except BaseException:
	        # Discard the partial output instead of renaming it over the
	        # target.
	        f.close()
	        try:
	            os.remove(tmpfilename)
	        except OSError:
	            pass
	        raise
	    f.close()
	    os.rename(tmpfilename, filename)


	def round_down(x, n):
	    """Round down `x` to nearest `n`."""
	    return x // n * n


	def round_up(x, n):
	    """Round up `x` to nearest `n`."""
	    # Adding n - 1 first turns the floor division into a ceiling.
	    return (x + (n - 1)) // n * n


	def metric_prefix(s):
	    """Parse a metric prefix as a number.

	    The comparison is case-insensitive.  The empty string gives 1 and
	    'k', 'm', 'g', 't' give successive powers of 1000.  Anything else
	    gives None.
	    """
	    s = s.lower()
	    if s == '':
	        return 1
	    if s == 'k':
	        return 1000
	    if s == 'm':
	        return 1000**2
	    if s == 'g':
	        return 1000**3
	    if s == 't':
	        return 1000**4
	    return None


	def parse_number(s):
	    """Parse a string as an integer with an optional metric suffix.

	    A trailing K, M, G or T (any case) scales the value by the matching
	    power of 1000 as given by `metric_prefix`.  The numeric part may be
	    an integer or anything `float` accepts; the scaled result is
	    truncated to an int.

	    Raises ValueError when the string is not a finite number, including
	    values that overflow to infinity.
	    """
	    scale = 1
	    m = re.match(r'(.*)([kmgtKMGT])$', s)
	    if m:
	        s = m.group(1)
	        scale = metric_prefix(m.group(2))
	    try:
	        # Plain integers are scaled exactly, without going through float.
	        return int(s) * scale
	    except ValueError:
	        pass
	    try:
	        # May raise ValueError.
	        return int(float(s) * scale)
	    except OverflowError:
	        # int(float('inf')) raises OverflowError rather than ValueError.
	        raise ValueError('number out of range: {0}'.format(s))


	def round_human_readable(x, up=False, tostring=True):
	    """Round off `x` so that it reads well with a metric suffix.

	    `x` is rounded (up if `up` is true, otherwise down) to three
	    significant figures.  If the rounded value is a whole multiple of
	    1000**4, 1000**3, 1000**2 or 1000 (tried in that order), it is
	    returned as a string such as '12G' when `tostring` is true, or as
	    the rounded number otherwise.  If no suffix fits, the rounded number
	    itself is returned.

	    Values below 1 (including zero and negative numbers) are returned
	    unchanged.
	    """
	    if x < 1:
	        return x
	    round_off = round_up if up else round_down
	    # Take 3 significant figures.  The exponent is clamped at zero so
	    # that the rounding step is never a fraction.
	    n = 10**max(int(math.floor(math.log10(x))) - 2, 0)
	    x = round_off(x, n)
	    # Find a good suffix which doesn't change the value.
	    for suffix, unit in (('T', 1000**4), ('G', 1000**3),
	                         ('M', 1000**2), ('K', 1000)):
	        if round_off(x, unit) == x:
	            return '{0}{1}'.format(x // unit, suffix) if tostring else x
	    return x


	class classproperty(property):  # noqa
	    """Decorator to make a property of a class."""

	    def __get__(self, cls, owner):
	        """Call the wrapped getter with the owning class."""
	        return classmethod(self.fget).__get__(None, owner)()


	class SystemInfo(object):
	    """System information obtained from the `lscpu` and `free` commands.

	    Each command is run at most once; its parsed output is cached on the
	    class.  Set `verbose` to True to get a note on stderr before a
	    command is run.
	    """

	    # Parsed `lscpu` output: {field name: value string}, or None.
	    _cpu_info = None
	    # Parsed `free -b` output: {row name: list of column strings}, or None.
	    _mem_info = None

	    verbose = False

	    @classproperty
	    def number_of_nodes(cls):  # noqa
	        """Return the number of nodes (1 if no NUMA nodes are reported)."""
	        info = cls._get_cpu_info()
	        if 'NUMA node(s)' in info:
	            return int(info['NUMA node(s)'])
	        else:
	            return 1

	    @classproperty
	    def number_of_cpus(cls):  # noqa
	        """Return the number of cpus."""
	        info = cls._get_cpu_info()
	        return int(info['CPU(s)'])

	    @classproperty
	    def number_of_physical_cores(cls):  # noqa
	        """Return the number of physical cores."""
	        info = cls._get_cpu_info()
	        return int(info['Socket(s)']) * int(info['Core(s) per socket'])

	    @classproperty
	    def total_memory(cls):  # noqa
	        """Return the total physical memory in bytes."""
	        info = cls._get_mem_info()
	        return int(info['Mem'][0])

	    @staticmethod
	    def _parse_fields(text):
	        """Split `key: value` lines into stripped (key, value) pairs.

	        Only the first colon on a line separates key and value; lines
	        without any colon are skipped.
	        """
	        pairs = []
	        for line in text.strip().split('\n'):
	            key, sep, value = line.partition(':')
	            if sep:
	                pairs.append((key.strip(), value.strip()))
	        return pairs

	    @classmethod
	    def _get_cpu_info(cls):
	        """Return the cached `lscpu` fields, running `lscpu` on first use."""
	        if cls._cpu_info is None:
	            if cls.verbose:
	                sys.stderr.write('running lscpu...\n')
	            output = subprocess.check_output(['lscpu']).decode('utf-8')
	            cls._cpu_info = dict(cls._parse_fields(output))
	        return cls._cpu_info

	    @classmethod
	    def _get_mem_info(cls):
	        """Return the cached `free -b` rows, running `free` on first use."""
	        if cls._mem_info is None:
	            if cls.verbose:
	                sys.stderr.write('running free...\n')
	            output = subprocess.check_output(['free', '-b']).decode('utf-8')
	            # Each row maps to the list of its columns.
	            cls._mem_info = dict((name, columns.split())
	                                 for name, columns
	                                 in cls._parse_fields(output))
	        return cls._mem_info


	class Setup(object):
	"""Setup parameters."""

	def __init__(self):
	"""Construct a set of setup parameters."""
	self.compresssize = 90000
	self.filepatches = 256
	self.hidesize = 0
	self.largepatches = 256
	self.largesize = 50000000
	self.maxtermsize = 40000 # 64-bit
	self.numstorecaches = 4
	self.scratchsize = 50000000
	self.sizestorecache = 32768
	self.smallextension = 20000000
	self.smallsize = 10000000
	self.sortiosize = 100000
	self.termsinsmall = 100000
	self.threadbucketsize = 500
	self.threads = -1 # form
	self.threadscratchoutsize = 2500000
	self.threadscratchsize = 100000
	self.workspace = 40000000 # 64-bit

	self.bracketindexsize = 200000
	self.constindex = 128
	self.continuationlines = 15
	self.functionlevels = 30
	self.maxnumbersize = 200
	self.maxwildcards = 100
	self.parentheses = 100
	self.processbucketsize = 1000
	self.subfilepatches = 64
	self.sublargepatches = 64
	self.sublargesize = 4000000
	self.subsmallextension = 800000
	self.subsmallsize = 500000
	self.subsortiosize = 32768
	self.subtermsinsmall = 10000

	# 64-bit
	self._ptrsize = 8
	self._possize = 8
	self._wordsize = 4

	def items(self):
	"""Return pairs of parameters and values."""
	items = [(k, v) for (k, v) in self.__dict__.items() if k[0] != '_']
	items.sort()
	return tuple(items)

	def __str__(self):
	"""Return the string representaiton."""
	mem = self.calc()
	params = ['{0}: {1}'.format(k, v) for (k, v) in self.items()]
	return '<Setup: {0} bytes, {1}>'.format(mem, ', '.join(params))

	def copy(self):
	"""Return a shallow copy."""
	return copy.copy(self)

	def calc(self):
	"""Return an estimation of memory usage."""
	self.maxtermsize = max(self.maxtermsize, 200)

	self.compresssize = max(self.compresssize,
	2 * self.maxtermsize * self._wordsize)
	self.sortiosize = max(self.sortiosize,
	self.maxtermsize * self._wordsize)

	# The strange factor WordSize**2 is used in the FORM source...
	self.scratchsize = max(self.scratchsize,
	4 * self.maxtermsize * self._wordsize**2)
	if self.hidesize > 0:
	self.hidesize = max(self.hidesize,
	4 * self.maxtermsize * self._wordsize**2)

	self.threadscratchsize = max(self.threadscratchsize,
	4 * self.maxtermsize * self._wordsize**2)
	self.threadscratchoutsize = max(self.threadscratchoutsize,
	4 * self.maxtermsize *
	self._wordsize**2)

	# constraints in RecalcSetups()

	self.filepatches = max(self.filepatches, self.threads)

	self.termsinsmall = round_up(self.termsinsmall, 16)

	numberofblocksinsort = 10
	minimumnumberofterms = 10
	n = numberofblocksinsort * minimumnumberofterms
	if self.threads >= 0:
	minbufsize = (self.threads * (1 + n) * self.maxtermsize *
	self._wordsize)
	if self.largesize + self.smallextension < minbufsize:
	self.largesize = minbufsize - self.smallextension

	# constraints in AllocSort()

	self.filepatches = max(self.filepatches, 4)

	self.smallsize = max(self.smallsize,
	16 * self.maxtermsize * self._wordsize)

	self.smallextension = max(self.smallextension, self.smallsize * 3 // 2)

	if self.largesize > 0:
	self.largesize = max(self.largesize, 2 * self.smallsize)

	compinc = 2
	minbufsize = self.filepatches * (self.sortiosize +
	(compinc + 2 * self.maxtermsize) *
	self._wordsize)
	if self.largesize + self.smallextension < minbufsize:
	if self.largesize == 0:
	self.smallextension = minbufsize
	else:
	self.largesize = minbufsize - self.smallextension

	iotry = (((self.largesize + self.smallextension) // self.filepatches //
	self._wordsize) - 2 * self.maxtermsize - compinc) # in words
	self.sortiosize = max(self.sortiosize, iotry) # bytes vs. words??

	# Compute the memory usage.

	mem = 0
	mem += (self.scratchsize * 2 + (self.hidesize
	if self.hidesize > 0
	else self.scratchsize))
	mem += self.workspace * self._wordsize
	mem += (self.compresssize + 10) * self._wordsize
	mem += (self.largesize + self.smallextension + 3 * self.termsinsmall *
	self._ptrsize + self.sortiosize)

	storecachesize = self._possize * 2 * self._ptrsize + self._wordsize
	# ignore the padding
	storecachesize += self.sizestorecache
	mem += storecachesize * self.numstorecaches

	if self.threads >= 1:
	mem += ((self.threadscratchoutsize + self.threadscratchsize * 2) *
	self.threads)
	mem += self.workspace * self._wordsize * self.threads
	mem += (self.compresssize + 10) * self._wordsize * self.threads

	mem += self._thread_alloc_sort(self.largesize // self.threads,
	self.smallsize // self.threads,
	self.smallextension // self.threads,
	self.termsinsmall,
	self.largepatches,
	self.filepatches // self.threads,
	self.sortiosize) * self.threads

	mem += storecachesize * self.numstorecaches * self.threads

	sizethreadbuckets = ((self.threadbucketsize + 1) *
	self.maxtermsize + 2) * self._wordsize
	if self.threadbucketsize >= 250:
	sizethreadbuckets //= 4
	elif self.threadbucketsize >= 90:
	sizethreadbuckets //= 3
	elif self.threadbucketsize >= 40:
	sizethreadbuckets //= 2
	sizethreadbuckets //= self._wordsize
	mem += ((2 * sizethreadbuckets * self._wordsize +
	(self.threadbucketsize + 1) * self._possize) *
	2 * self.threads)
	if self.threads >= 3:
	mem += ((self.workspace * self._wordsize // 8 +
	2 * self.maxtermsize * self._wordsize) *
	(self.threads - 2))

	return mem

	def _thread_alloc_sort(self, largesize, smallsize, smallextension,
	termsinsmall, largepatches, filepatches,
	sortiosize):

	filepatches = max(filepatches, 4)

	smallsize = max(smallsize, 16 * self.maxtermsize * self._wordsize)

	smallextension = max(smallextension, smallsize * 3 // 2)

	if largesize > 0:
	largesize = max(largesize, 2 * smallsize)

	compinc = 2
	minbufsize = filepatches * (sortiosize + (compinc +
	2 * self.maxtermsize) * self._wordsize)
	if largesize + smallextension < minbufsize:
	if largesize == 0:
	smallextension = minbufsize
	else:
	largesize = minbufsize - smallextension

	iotry = (((largesize + smallextension) // filepatches //
	self._wordsize) - 2 * self.maxtermsize - compinc) # in words
	sortiosize = max(sortiosize, iotry) # bytes vs. words??

	return (largesize + smallextension + 3 * termsinsmall * self._ptrsize +
	sortiosize)


	def _build_argument_parser():
	    """Create the command-line parser used by `main`.

	    The automatic help of argparse is switched off (``add_help=False``) and
	    ``-h/--help`` is declared as an ordinary flag, so the caller decides
	    when the help message is printed.
	    """
	    parser = argparse.ArgumentParser(
	        usage=('%(prog)s [options] [--] '
	               '[par=val].. [par+=int].. [par*=float]..'),
	        epilog=('On non-Linux systems, the number of physical CPUs and memory '
	                'available on the machine may be not automatically detected. '
	                'In such a case, one cannot use the default parameters '
	                'depending on those values and needs to explicitly specify '
	                '--ncpus, --total-cpus and --total-memory.'),
	        add_help=False
	    )
	    parser.add_argument('-h',
	                        '--help',
	                        action='store_const',
	                        const=True,
	                        help='show this help message and exit')
	    parser.add_argument('-o',
	                        '--output',
	                        action='store',
	                        nargs='?',
	                        const='form.set',
	                        help=('output to FILE (default: no (stdout), '
	                              'FILE=form.set)'),
	                        metavar='FILE')
	    parser.add_argument('-f',
	                        '--form',
	                        action='store_const',
	                        const=True,
	                        help='print tform options (e.g., -w4) and exit')
	    parser.add_argument('-m',
	                        '--minos',
	                        action='store_const',
	                        const=True,
	                        help='print minos options (e.g., -m2x4) and exit')
	    parser.add_argument('-u',
	                        '--usage',
	                        action='store_const',
	                        const=True,
	                        help='print expected initial memory usage and exit')
	    parser.add_argument('-H',
	                        '--human-readable',
	                        action='store_const',
	                        const=True,
	                        help=('adjust to human-readable numbers '
	                              '(e.g., 1K, 23M, 456G)'))
	    # -1 and --full share the destination of --ncpus; negative values mean
	    # "this many nodes" and are resolved to a CPU count in `main`.
	    parser.add_argument('-1',
	                        '--one',
	                        action='store_const',
	                        const=-1,
	                        dest='ncpus',
	                        help='use cpus in a node on the machine (default)')
	    parser.add_argument('--full',
	                        action='store_const',
	                        const=-99999,
	                        dest='ncpus',
	                        help='use cpus in all nodes on the machine')
	    parser.add_argument('-n',
	                        '--ncpus',
	                        action='store',
	                        type=int,
	                        help='use N cpus',
	                        metavar='N')
	    parser.add_argument('-p',
	                        '--percentage',
	                        action='store',
	                        default=75.0,
	                        type=float,
	                        help=('percentage of initial memory usage '
	                              '(default: 75.0)'),
	                        metavar='N')
	    parser.add_argument('--total-cpus',
	                        action='store',
	                        type=int,
	                        help='specify the total cpus on the machine',
	                        metavar='N')
	    parser.add_argument('--total-memory',
	                        action='store',
	                        help='specify the total memory on the machine',
	                        metavar='N')
	    parser.add_argument('-v',
	                        '--verbose',
	                        action='store_const',
	                        const=True,
	                        help='verbose output')
	    parser.add_argument('args',
	                        nargs='*',
	                        help=argparse.SUPPRESS)
	    return parser


	def main():
	    """Entry point.

	    Parse the command line, determine the CPUs and the memory budget to be
	    used, scale the setup parameters by bisection such that the computed
	    memory usage stays within the budget, and write the parameters that
	    differ from the defaults (to stdout or to the file given by --output).
	    With --form, --minos or --usage only the requested information is
	    printed. The process is terminated via `SystemExit` for --help, for
	    these informational options and on errors.
	    """
	    # Parse the command line arguments.
	    parser = _build_argument_parser()
	    args = parser.parse_args()
	    pars = {}

	    if args.verbose:
	        SystemInfo.verbose = True

	    # Help message. It is handled before the system information is queried
	    # because, as the epilog says, the automatic detection may not work on
	    # every machine; asking for help should not depend on it.
	    if args.help:
	        parser.print_help()
	        sys.exit(0)

	    # NOTE: when all of `--ncpus`, `--total-cpus` and `--total-memory` are
	    #       specified, we don't need to access the system information.

	    if args.total_cpus:
	        total_cpus = args.total_cpus
	    else:
	        total_cpus = SystemInfo.number_of_physical_cores

	    if args.total_memory:
	        try:
	            total_memory = parse_number(args.total_memory)
	        except ValueError:
	            parser.error('non-integer value for total memory: {0}'.format(
	                args.total_memory))
	    else:
	        total_memory = SystemInfo.total_memory

	    # Number of CPUs.
	    if args.ncpus is not None:
	        ncpus = args.ncpus
	    else:
	        # Use 1 node for each job by default.
	        ncpus = -1
	    if ncpus < 0:
	        # Use (-ncpus) nodes.
	        ncpus = -ncpus * (total_cpus // SystemInfo.number_of_nodes)
	    ncpus = max(ncpus, 1)
	    ncpus = min(ncpus, total_cpus)

	    sp = Setup()
	    sp.threads = ncpus if ncpus >= 2 else -1

	    # Positional arguments: par=val, par+=int or par*=float. The name is a
	    # letter followed by any number of letters/digits, and the operator may
	    # be '+' or '*' as documented in the usage string.
	    for a in args.args:
	        m = re.match(r'([a-zA-Z][a-zA-Z0-9]*)([+*]?)=(.*)', a)
	        if m:
	            par = m.group(1).lower()
	            ope = m.group(2)
	            val = m.group(3)
	            if par in sp.__dict__:
	                # Known parameter.
	                if ope == '' or ope == '+':
	                    # We have par=val or par+=int.
	                    try:
	                        val = parse_number(val)
	                    except ValueError:
	                        parser.error(
	                            'non-integer value for parameter: {0}'.format(a))
	                    if ope == '':
	                        setattr(sp, par, val)
	                    else:
	                        setattr(sp, par, getattr(sp, par) + val)
	                    continue
	                else:
	                    # We have par*=float.
	                    try:
	                        val = float(val)
	                    except ValueError:
	                        parser.error(
	                            'non-float value for parameter: {0}'.format(a))
	                    setattr(sp, par, int(getattr(sp, par) * val))
	                    continue
	            elif ope == '':
	                # Unknown parameter given by par=val. Add it to the dictionary.
	                pars[par] = val
	                continue
	        parser.error('unrecognized argument: {0}'.format(a))

	    # Our resource.
	    cpus = max(sp.threads, 1)
	    memory = int(total_memory * args.percentage / 100.0 * cpus / total_cpus)

	    # For --form option.
	    if args.form:
	        print('-w{0}'.format(cpus))
	        sys.exit()

	    # For --minos option.
	    if args.minos:
	        print('-m{0}x{1}'.format(total_cpus // cpus, cpus))
	        sys.exit()

	    # Presumably increasing MaxTermSize requires increasing WorkSpace, too.
	    sp.workspace = max(sp.workspace, sp.maxtermsize * 250)

	    # Optimize the memory usage by bisection.
	    max_iteration = 50

	    # Unscaled parameters; every trial starts from a fresh copy of them.
	    base = sp.copy()

	    def f(x):
	        """Return (usage - budget, setup) for the scale factor `x`."""
	        # Hopefully monotonically increasing.
	        trial = base.copy()
	        trial.smallsize = int(trial.smallsize * x)
	        trial.largesize = int(trial.largesize * x)
	        trial.termsinsmall = int(trial.termsinsmall * x)
	        trial.scratchsize = int(trial.scratchsize * x)
	        m = trial.calc()
	        if args.human_readable:
	            m = round_human_readable(m, True, False)
	        return (- (memory - m), trial)

	    # Invariant once the bracket is found: f(x1) < 0 <= f(x2), i.e., x1 is
	    # the largest known scale factor that still fits the budget.
	    x1 = 1.0
	    x2 = None
	    y1 = f(x1)[0]
	    y2 = None
	    for _i in range(max_iteration):
	        if x2 is None:
	            if y1 < 0:
	                # Below the budget: grow until the budget is exceeded.
	                x = x1 * 2.0
	                y = f(x)[0]
	                if y > 0:
	                    x2 = x
	                    y2 = y
	                else:
	                    x1 = x
	                    y1 = y
	            else:
	                # Not below the budget: shrink until the usage fits.
	                x = x1 * 0.5
	                y = f(x)[0]
	                if y < 0:
	                    x2 = x1
	                    y2 = y1
	                    x1 = x
	                    y1 = y
	                else:
	                    x1 = x
	                    y1 = y
	        else:
	            # Bracket known: halve it.
	            x = (x1 + x2) * 0.5
	            y = f(x)[0]
	            if y < 0:
	                x1 = x
	                y1 = y
	            else:
	                x2 = x
	                y2 = y
	        if x2 is not None:
	            assert x1 < x2 and y1 < y2

	    if x2 is None:
	        if x1 < 1.0e-12:
	            x1 = 0
	        # NOTE: the message is passed as the first (status) argument, so it
	        #       ends up in SystemExit and is reported by the interpreter.
	        parser.exit(('failed to find parameters: memory({0}) = {1} '
	                     'bytes shortage').format(x1, y1))

	    # For --usage option.
	    if args.usage:
	        m = f(x1)[1].calc()
	        if args.human_readable:
	            m = round_human_readable(m, True)
	        print(m)
	        sys.exit()

	    # Output.
	    with open_w_or_stdout(args.output) as fi:
	        def round_memory(m):
	            return (round_human_readable(m, False)
	                    if args.human_readable else m)

	        print(('# {0}{1} (cpu: {2}, mem: {3}; '
	               'total cpu: {4}, total mem: {5}; {6}x{7})').format(
	            parser.prog,
	            (' ' if len(sys.argv) >= 2 else '') + ' '.join(sys.argv[1:]),
	            cpus,
	            round_memory(memory),
	            total_cpus,
	            round_memory(total_memory),
	            total_cpus // cpus,
	            cpus,
	        ), file=fi)

	        sp = f(x1)[1]
	        # Default values, kept under a name of their own so that the
	        # variable captured by `f` is never rebound.
	        defaults = dict(Setup().items())
	        for k, v in sp.items():
	            if k == 'threads':
	                # 'threads N' doesn't work, must be given by tform option -wN.
	                continue
	            if v == defaults[k]:
	                # Don't write when same as the default value.
	                continue
	            if args.human_readable:
	                v = round_human_readable(v, False)
	            print('{0} {1}'.format(k, v), file=fi)
	        for k, v in pars.items():
	            print('{0} {1}'.format(k, v), file=fi)


	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
	    main()